diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a5c01bee7..6b89502b5 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,6 +5,11 @@ updates: - package-ecosystem: github-actions directory: / schedule: - interval: daily + interval: weekly labels: - type:ci + # https://til.simonwillison.net/github/dependabot-python-setup + groups: + github-actions: + patterns: + - '*' diff --git a/.github/labels.yml b/.github/labels.yml new file mode 100644 index 000000000..20a4408dd --- /dev/null +++ b/.github/labels.yml @@ -0,0 +1,153 @@ +# config for https://github.com/marketplace/actions/github-labeler + + - name: attention:help wanted + description: Extra attention is needed + color: bfdadc + from_name: help wanted + + - name: attention:invalid + description: This doesn't seem right + color: ea2357 + from_name: invalid + + - name: ci:skip-changelog + description: Add this label to skip changelog file check + color: 04990f + + - name: component:connection + description: Connection-related + color: 75aeb3 + + - name: component:db + description: DB-related + color: 5319e7 + + - name: component:file + description: File-related + color: 75f526 + + - name: component:hooks + description: Hooks-related + color: 006b75 + + - name: component:hwm + description: HWM-related + color: 670f54 + + - name: component:plugins + description: Plugins-related + color: 77ba7b + + - name: component:spark + description: Spark-related + color: e09183 + + - name: db:clickhouse + description: Clickhouse-related + color: 1d76db + + - name: db:greenplum + description: Greenplum-related + color: c2e0c6 + + - name: db:hive + description: Hive-related + color: ef2895 + + - name: db:kafka + description: Kafka-related + color: f00a38 + + - name: db:mongodb + description: MongoDB-related + color: 1587c4 + + - name: db:mssql + description: MSSQL-related + color: ad368a + + - name: db:mysql + description: MySQL-related + color: '473255' + + - name: db:oracle + description: Oracle-related + color: '646821' + + - name: db:postgres + description: Postgres-related + color: 1077a6 + + - name: db:teradata + description: Teradata-related + color: 04115d + + - name: file:ftp + description: FTP-related + color: e41c74 + + - name: file:ftps + description: FTPS-related + color: 07c469 + + - name: file:hdfs + description: HDFS-related + color: 0295d0 + + - name: file:s3 + description: S3-related + color: b0e8f9 + + - name: file:sftp + description: SFTP-related + color: 3e9d6d + + - name: file:webdav + description: WebDAV-related + color: 6d1c10 + + - name: kind:bug + description: Something isn't working + color: d73a4a + from_name: bug + + - name: kind:feature + description: New feature or request + color: 389a3f + + - name: kind:improvement + description: Improvement of some existing feature + color: 1a92c2 + from_name: enhancement + + - name: kind:question + description: Further information is requested + color: 0e857c + from_name: question + + - name: resolution:duplicate + description: This issue or pull request already exists + color: cfd3d7 + from_name: duplicate + + - name: resolution:wontfix + description: This will not be worked on + color: ec103b + from_name: wontfix + + - name: type:ci + description: CI-related changes + color: cdb0bd + + - name: type:dependency + description: Dependency-related changes + color: 214efe + + - name: type:documentation + description: Improvements or additions to documentation + color: 6b9f54 + from_name: documentation + + - name: type:tests + description: Tests-related changes 
+ color: 5cca5b diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index c29d2d6c2..40bec6775 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -10,7 +10,7 @@ jobs: if: github.event.pull_request.user.login == 'pre-commit-ci[bot]' || github.event.pull_request.user.login == 'dependabot[bot]' steps: - - uses: alexwilson/enable-github-automerge-action@1.0.0 + - uses: alexwilson/enable-github-automerge-action@2.0.0 with: github-token: ${{ secrets.AUTOMERGE_TOKEN }} merge-method: REBASE diff --git a/.github/workflows/data/base/tracked.txt b/.github/workflows/data/base/tracked.txt index a2fac112a..c7bb90eb0 100644 --- a/.github/workflows/data/base/tracked.txt +++ b/.github/workflows/data/base/tracked.txt @@ -3,4 +3,5 @@ .github/workflows/data/base/** requirements/core.txt requirements/tests/base.txt +requirements/tests/pydantic-*.txt .env.local diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 7c9b53384..9c8c558ba 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index 522e2160a..a7339e139 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml index 86243228c..49468d914 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ b/.github/workflows/data/ftp/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/ftps image has only latest tag - ftp-version: latest <<: *max - full: &full + full: - ftp-version: latest <<: *min - ftp-version: latest <<: *max - nightly: *full + nightly: + - ftp-version: latest + <<: *min + - ftp-version: latest + <<: *latest diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index bb61dc6b0..ec5a862cd 100644 --- a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/ftps image has only latest tag - ftps-version: latest <<: *max - full: &full + full: - ftps-version: latest <<: *min - ftps-version: latest <<: *max - nightly: *full + nightly: + - ftps-version: 
latest + <<: *min + - ftps-version: latest + <<: *latest diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 7715ed5c3..2b66c0e19 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -1,6 +1,7 @@ min: &min # Spark 2.3.0 does not support passing ivysettings.xml spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -8,6 +9,15 @@ min: &min max: &max # Greenplum connector does not support Spark 3.3+ spark-version: 3.2.4 + pydantic-version: 2 + python-version: '3.10' + java-version: 11 + os: ubuntu-latest + +latest: &latest + # Greenplum connector does not support Spark 3.3+ + spark-version: 3.2.4 + pydantic-version: latest python-version: '3.10' java-version: 11 os: ubuntu-latest @@ -15,10 +25,19 @@ max: &max matrix: small: - greenplum-version: 7.0.0 + package-version: 2.3.1 <<: *max - full: &full + full: - greenplum-version: 6.25.3 + package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 + package-version: 2.3.1 <<: *max - nightly: *full + nightly: + - greenplum-version: 6.25.3 + package-version: 2.2.0 + <<: *min + - greenplum-version: 7.0.0 + package-version: 2.3.1 + <<: *latest diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index c117c8d6f..e62f0242a 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -1,6 +1,7 @@ min: &min hadoop-version: hadoop2-hdfs spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -8,6 +9,7 @@ min: &min max: &max hadoop-version: hadoop3-hdfs spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest @@ -15,6 +17,7 @@ max: &max latest: &latest hadoop-version: hadoop3-hdfs spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 370d22d95..0f7d4ba6b 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 26ccf4af7..29e587721 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -1,6 +1,7 @@ min: &min # kafka_version: 0.10.2-1-r3 kafka-version: 3.5.1 + pydantic-version: 1 spark-version: 2.4.8 python-version: '3.7' java-version: 8 @@ -8,6 +9,7 @@ min: &min max: &max kafka-version: 3.5.1 + pydantic-version: 2 spark-version: 3.5.0 python-version: '3.12' java-version: 20 @@ -15,6 +17,7 @@ max: &max latest: &latest kafka-version: latest + pydantic-version: latest spark-version: latest python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index 6dcf8efd3..b3db2391f 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -1,29 +1,34 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' 
java-version: 8 os: ubuntu-latest min_avro: &min_avro spark-version: 2.4.8 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest min_excel: &min_excel spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 34e66b85a..c916cc306 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,18 +1,21 @@ min: &min # MongoDB connector does not support Spark 2 spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.4.2 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index bbb121c39..0138805bb 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index aa1c575d1..9b64e3b93 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 1230f659c..55dc4c185 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 78de0b38b..8cdff4f63 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml 
b/.github/workflows/data/s3/matrix.yml index b2d595d6b..a0825603b 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -3,6 +3,7 @@ min: &min minio-version: 2021.3.17 # Minimal Spark version with Hadoop 3.x support spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -10,6 +11,7 @@ min: &min max: &max minio-version: 2023.7.18 spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest @@ -17,6 +19,7 @@ max: &max latest: &latest minio-version: latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index f03adcd08..5b0b2628e 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -10,9 +17,13 @@ matrix: small: - server-version: latest <<: *max - full: &full + full: - server-version: latest <<: *min - server-version: latest <<: *max - nightly: *full + nightly: + - server-version: latest + <<: *min + - server-version: latest + <<: *latest diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 3379a6e3e..0dfd9e730 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -20,4 +27,4 @@ matrix: - openssh-version: 8.1_p1-r0-ls5 <<: *min - openssh-version: latest - <<: *max + <<: *latest diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 5391077c9..9647daec6 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,11 +1,13 @@ max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index 0a67ff838..8d8f012a7 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/webdav image has only latest tag - webdav-version: latest <<: *max - full: &full + full: - webdav-version: latest <<: *min - webdav-version: latest <<: *max - nightly: *full + nightly: + - webdav-version: latest + <<: *min + - webdav-version: latest + <<: *latest diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index a97dd5a7d..e4579eb28 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -86,7 +86,7 @@ jobs: - name: Check if base files are changed id: 
changed-base - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/base/tracked.txt files_ignore_from_source_file: .github/workflows/data/base/ignored.txt @@ -97,7 +97,7 @@ jobs: - name: Check if db-related files are changed id: changed-db - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/db/tracked.txt files_ignore_from_source_file: .github/workflows/data/db/ignored.txt @@ -108,7 +108,7 @@ jobs: - name: Check if file-related files are changed id: changed-file - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/file/tracked.txt files_ignore_from_source_file: .github/workflows/data/file/ignored.txt @@ -119,7 +119,7 @@ jobs: - name: Check if file-df-related files are changed id: changed-file-df - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/file-df/tracked.txt files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt @@ -130,7 +130,7 @@ jobs: - name: Check if core files are changed id: changed-core - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/core/tracked.txt files_ignore_from_source_file: .github/workflows/data/core/ignored.txt @@ -154,13 +154,13 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml - name: Check if Clickhouse files are changed id: changed-clickhouse - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/clickhouse/tracked.txt files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt @@ -184,13 +184,13 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml - name: Check if Greenplum files are changed id: changed-greenplum - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/greenplum/tracked.txt files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt @@ -214,13 +214,13 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml - name: Check if Hive files are changed id: changed-hive - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/hive/tracked.txt files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt @@ -244,13 +244,13 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml - name: Check if Kafka files are changed id: changed-kafka - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/kafka/tracked.txt files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt @@ -274,13 +274,13 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json 
'.matrix' .github/workflows/data/kafka/matrix.yml - name: Check if LocalFS files are changed id: changed-local-fs - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/local-fs/tracked.txt files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt @@ -304,13 +304,13 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml - name: Check if MongoDB files are changed id: changed-mongodb - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mongodb/tracked.txt files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt @@ -334,13 +334,13 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml - name: Check if MSSQL files are changed id: changed-mssql - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mssql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt @@ -364,13 +364,13 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml - name: Check if MySQL files are changed id: changed-mysql - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mysql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt @@ -394,13 +394,13 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml - name: Check if Oracle files are changed id: changed-oracle - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/oracle/tracked.txt files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt @@ -424,13 +424,13 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml - name: Check if Postgres files are changed id: changed-postgres - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/postgres/tracked.txt files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt @@ -454,13 +454,13 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml - name: Check if Teradata files are changed id: changed-teradata - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/teradata/tracked.txt files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt @@ -484,13 +484,13 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml - name: Check if FTP files are changed id: changed-ftp - uses: 
tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/ftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt @@ -514,13 +514,13 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml - name: Check if FTPS files are changed id: changed-ftps - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/ftps/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt @@ -544,13 +544,13 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml - name: Check if HDFS files are changed id: changed-hdfs - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/hdfs/tracked.txt files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt @@ -574,13 +574,13 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml - name: Check if S3 files are changed id: changed-s3 - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/s3/tracked.txt files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt @@ -604,13 +604,13 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml - name: Check if SFTP files are changed id: changed-sftp - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/sftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt @@ -634,13 +634,13 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml - name: Check if Samba files are changed id: changed-samba - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/samba/tracked.txt files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt @@ -664,13 +664,13 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml - name: Check if WebDAV files are changed id: changed-webdav - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/webdav/tracked.txt files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 58b34cee1..34a0af1b3 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -20,7 +20,7 @@ jobs: nightly: true tests-core: - name: Run core tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ 
matrix.python-version }}, os=${{ matrix.os }}) + name: Run core tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -30,13 +30,14 @@ jobs: uses: ./.github/workflows/test-core.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-clickhouse: - name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -48,13 +49,14 @@ jobs: clickhouse-image: ${{ matrix.clickhouse-image }} clickhouse-version: ${{ matrix.clickhouse-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-greenplum: - name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -65,6 +67,8 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + package-version: ${{ matrix.package-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} @@ -74,7 +78,7 @@ jobs: GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} tests-hive: - name: Run Hive tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Hive tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -84,13 +88,14 @@ jobs: uses: ./.github/workflows/test-hive.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-kafka: - name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -101,13 +106,14 @@ jobs: with: 
kafka-version: ${{ matrix.kafka-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-local-fs: - name: Run LocalFS tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run LocalFS tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -117,13 +123,14 @@ jobs: uses: ./.github/workflows/test-local-fs.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mongodb: - name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -134,13 +141,14 @@ jobs: with: mongodb-version: ${{ matrix.mongodb-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mssql: - name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -151,13 +159,14 @@ jobs: with: mssql-version: ${{ matrix.mssql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mysql: - name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -168,13 +177,14 @@ jobs: with: mysql-version: ${{ matrix.mysql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-oracle: - name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Oracle tests (server=${{ 
matrix.oracle-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -187,13 +197,14 @@ jobs: oracle-version: ${{ matrix.oracle-version }} db-name: ${{ matrix.db-name }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-postgres: - name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -204,13 +215,14 @@ jobs: with: postgres-version: ${{ matrix.postgres-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-teradata: - name: Run Teradata tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Teradata tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -220,13 +232,14 @@ jobs: uses: ./.github/workflows/test-teradata.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-ftp: - name: Run FTP tests (server=${{ matrix.ftp-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTP tests (server=${{ matrix.ftp-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -236,12 +249,13 @@ jobs: uses: ./.github/workflows/test-ftp.yml with: ftp-version: ${{ matrix.ftp-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-ftps: - name: Run FTPS tests (server=${{ matrix.ftps-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTPS tests (server=${{ matrix.ftps-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -251,12 +265,13 @@ jobs: uses: ./.github/workflows/test-ftps.yml with: ftps-version: ${{ matrix.ftps-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-hdfs: - name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ 
matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -267,13 +282,14 @@ jobs: with: hadoop-version: ${{ matrix.hadoop-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-s3: - name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -284,13 +300,14 @@ jobs: with: minio-version: ${{ matrix.minio-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-sftp: - name: Run SFTP tests (server=${{ matrix.openssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run SFTP tests (server=${{ matrix.openssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -300,12 +317,13 @@ jobs: uses: ./.github/workflows/test-sftp.yml with: openssh-version: ${{ matrix.openssh-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-samba: - name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Samba tests (server=${{ matrix.server-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -315,12 +333,13 @@ jobs: uses: ./.github/workflows/test-samba.yml with: server-version: ${{ matrix.server-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-webdav: - name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -330,6 +349,7 @@ jobs: uses: ./.github/workflows/test-webdav.yml with: webdav-version: ${{ matrix.webdav-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90dfc638d..72aab7800 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -105,7 +105,7 @@ jobs: - name: Create Github release id: create_release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: token: ${{ secrets.GITHUB_TOKEN }} draft: false diff --git a/.github/workflows/repo-labels-sync.yml b/.github/workflows/repo-labels-sync.yml new file mode 100644 index 
000000000..c0b1e4c5b --- /dev/null +++ b/.github/workflows/repo-labels-sync.yml @@ -0,0 +1,27 @@ +name: Repo labels sync + +on: + push: + branches: + - develop + paths: + - .github/labels.yml + - .github/workflows/repo-labels-sync.yml + pull_request: + paths: + - .github/labels.yml + - .github/workflows/repo-labels-sync.yml + +jobs: + labeler: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Labeler + uses: crazy-max/ghaction-github-labeler@v5 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + yaml-file: .github/labels.yml + dry-run: ${{ github.event_name == 'pull_request' }} diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 236c1b125..4a29f84cb 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -11,6 +11,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -27,7 +30,7 @@ on: jobs: test-clickhouse: - name: Run Clickhouse tests (server=${{ inputs.clickhouse-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Clickhouse tests (server=${{ inputs.clickhouse-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: clickhouse: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ 
inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Clickhouse to be ready run: | diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index 6e9036df1..cdd977398 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-core: - name: Run core tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run core tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ 
inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 962013077..4e947d738 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -5,6 +5,9 @@ on: ftp-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-ftp: - name: Run FTP tests (server=${{ inputs.ftp-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run FTP tests (server=${{ inputs.ftp-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 3be745aed..19cce458c 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -5,6 +5,9 @@ on: ftps-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-ftps: - name: Run FTPS tests (server=${{ inputs.ftps-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run FTPS tests (server=${{ inputs.ftps-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 94b99436c..bf5e5012e 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -5,9 +5,15 @@ on: greenplum-version: required: true type: string + package-version: + required: true + type: string spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -30,7 +36,7 @@ on: jobs: test-greenplum: if: github.repository == 'MobileTeleSystems/onetl' # prevent running on forks - name: Run Greenplum tests (server=${{ inputs.greenplum-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Greenplum tests (server=${{ inputs.greenplum-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: greenplum: @@ -60,20 +66,20 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version 
}}-tests-greenplum- - name: Set up Postgres client if: runner.os == 'Linux' @@ -86,7 +92,7 @@ jobs: - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Greenplum to be ready run: | @@ -104,6 +110,7 @@ jobs: mkdir reports/ || echo "Directory exists" sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env source ./env + export ONETL_GP_PACKAGE_VERSION=${{ inputs.package-version }} ./pytest_runner.sh -m greenplum env: ONETL_DB_WITH_GREENPLUM: 'true' diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index c97f5357c..e97ee6249 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-hdfs: - name: Run HDFS tests (server=${{ inputs.hadoop-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run HDFS tests (server=${{ inputs.hadoop-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -53,27 +56,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 
'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Cannot use services because we need to mount config file from the repo, but services start before checkout. # See https://github.com/orgs/community/discussions/25792 diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 6c77690fb..446cb76d0 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-hive: - name: Run Hive tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Hive tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 
'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index a72172d84..d1389049f 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-kafka: - name: Run Kafka tests (server=${{ inputs.kafka-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Kafka tests (server=${{ inputs.kafka-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: @@ -91,27 +94,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 
'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Kafka to be ready run: | diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index 98c98e652..529deef41 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-local-fs: - name: Run LocalFS tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run LocalFS tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version 
}}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index 3366bb3cb..c842ce7af 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mongodb: - name: Run MongoDB tests (server=${{ inputs.mongodb-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MongoDB tests (server=${{ inputs.mongodb-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mongodb: @@ -56,27 +59,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ 
inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MongoDB to be ready run: | diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 529f9edec..efc8edce8 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mssql: - name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mssql: @@ -59,27 +62,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ 
hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MSSQL to be ready run: | diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index 36d800a57..c98562dd4 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mysql: - name: Run MySQL tests (server=${{ inputs.mysql-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MySQL tests (server=${{ inputs.mysql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mysql: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ 
inputs.pydantic-version }}-tests-mysql- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MySQL to be ready run: | diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index a61395c5b..c22bc9553 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -15,6 +15,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -31,7 +34,7 @@ on: jobs: test-oracle: - name: Run Oracle tests (server=${{ inputs.oracle-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Oracle tests (server=${{ inputs.oracle-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: oracle: @@ -64,20 +67,20 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle- + ${{ runner.os }}-ivy-${{ inputs.spark-version 
}}-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-oracle- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle- - name: Set up Oracle instantclient if: runner.os == 'Linux' @@ -93,7 +96,7 @@ jobs: - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Oracle to be ready run: | diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index bc2666248..7ac821bc1 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-postgres: - name: Run Postgres tests (server=${{ inputs.postgres-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Postgres tests (server=${{ inputs.postgres-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: postgres: @@ -57,27 +60,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ 
inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Postgres to be ready run: | diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 0796ed171..a75297fe3 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-s3: - name: Run S3 tests (server=${{ inputs.minio-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run S3 tests (server=${{ inputs.minio-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: s3: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ 
inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for S3 to be ready run: | diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 4612ff95f..3a7c1c921 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -5,6 +5,9 @@ on: server-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-samba: - name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Samba tests (server=${{ inputs.server-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ 
inputs.pydantic-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-samba- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' because of custom parameter for samba container start - name: Start Samba diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 301dd27c8..569d580f7 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -5,6 +5,9 @@ on: openssh-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-sftp: - name: Run SFTP tests (server=${{ inputs.openssh-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run SFTP tests (server=${{ inputs.openssh-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: sftp: @@ -45,17 +48,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for SFTP to be ready run: | diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index 64d332994..214859e2b 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -5,6 +5,9 @@ on: 
spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-teradata: - name: Run Teradata tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Teradata tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,26 +47,26 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata- - name: Cache pip uses: actions/cache@v4 with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index e259d4dfe..ee23f0ae8 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -5,6 +5,9 @@ on: webdav-version: required: true type: string + 
pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-webdav: - name: Run WebDAV tests (server=${{ inputs.webdav-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run WebDAV tests (server=${{ inputs.webdav-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-webdav/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b48f88c5a..f7a7cf074 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,7 +21,7 @@ jobs: uses: ./.github/workflows/get-matrix.yml tests-core: - name: Run core tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run core tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -31,12 +31,13 @@ jobs: uses: ./.github/workflows/test-core.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-clickhouse: - name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -48,12 +49,13 @@ jobs: clickhouse-image: ${{ matrix.clickhouse-image }} clickhouse-version: ${{ matrix.clickhouse-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-greenplum: - name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -64,6 +66,8 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + package-version: ${{ matrix.package-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} @@ -72,7 +76,7 @@ jobs: GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} tests-hive: - name: Run Hive tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Hive tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -82,12 +86,13 @@ jobs: uses: ./.github/workflows/test-hive.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-kafka: - name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Kafka 
tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -98,12 +103,13 @@ jobs: with: kafka-version: ${{ matrix.kafka-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-local-fs: - name: Run LocalFS tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run LocalFS tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -113,12 +119,13 @@ jobs: uses: ./.github/workflows/test-local-fs.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mongodb: - name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -129,12 +136,13 @@ jobs: with: mongodb-version: ${{ matrix.mongodb-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mssql: - name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -145,12 +153,13 @@ jobs: with: mssql-version: ${{ matrix.mssql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mysql: - name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -161,12 +170,13 @@ jobs: with: mysql-version: ${{ matrix.mysql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-oracle: - name: Run Oracle tests (server=${{ 
matrix.oracle-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -179,12 +189,13 @@ jobs: oracle-version: ${{ matrix.oracle-version }} db-name: ${{ matrix.db-name }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-postgres: - name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -195,12 +206,13 @@ jobs: with: postgres-version: ${{ matrix.postgres-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-teradata: - name: Run Teradata tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Teradata tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -210,12 +222,13 @@ jobs: uses: ./.github/workflows/test-teradata.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-ftp: - name: Run FTP tests (server=${{ matrix.ftp-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTP tests (server=${{ matrix.ftp-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -225,11 +238,12 @@ jobs: uses: ./.github/workflows/test-ftp.yml with: ftp-version: ${{ matrix.ftp-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-ftps: - name: Run FTPS tests (server=${{ matrix.ftps-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTPS tests (server=${{ matrix.ftps-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -239,11 +253,12 @@ jobs: uses: ./.github/workflows/test-ftps.yml with: ftps-version: ${{ matrix.ftps-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-hdfs: - name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run HDFS tests 
(server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -254,12 +269,13 @@ jobs: with: hadoop-version: ${{ matrix.hadoop-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-s3: - name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -270,12 +286,13 @@ jobs: with: minio-version: ${{ matrix.minio-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-sftp: - name: Run SFTP tests (server=${{ matrix.openssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run SFTP tests (server=${{ matrix.openssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -285,11 +302,12 @@ jobs: uses: ./.github/workflows/test-sftp.yml with: openssh-version: ${{ matrix.openssh-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-samba: - name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Samba tests (server=${{ matrix.server-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -299,12 +317,13 @@ jobs: uses: ./.github/workflows/test-samba.yml with: server-version: ${{ matrix.server-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-webdav: - name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run WebDAV tests (server=${{ matrix.webdav-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -314,6 +333,7 @@ jobs: uses: ./.github/workflows/test-webdav.yml with: webdav-version: ${{ matrix.webdav-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} @@ -378,8 +398,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} file: ./reports/coverage.xml fail_ci_if_error: true - # TODO: remove after fixing https://github.com/codecov/codecov-cli/issues/367 - plugin: 'gcov' + plugin: noop - name: All done run: echo 1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e209d7e54..516ea1cf1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.5.4 + rev: v1.5.5 
hooks: - id: forbid-tabs - id: remove-tabs @@ -89,13 +89,13 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 + rev: v3.15.1 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 24.1.1 + rev: 24.3.0 hooks: - id: black language_version: python3 @@ -105,7 +105,17 @@ repos: hooks: - id: blacken-docs additional_dependencies: - - black==24.1.1 + - black==24.3.0 + + - repo: https://github.com/pycqa/bandit + rev: 1.7.8 + hooks: + - id: bandit + args: + - --aggregate=file + - -iii + - -ll + require_serial: true - repo: meta hooks: @@ -113,7 +123,7 @@ repos: - id: check-useless-excludes - repo: https://github.com/PyCQA/autoflake - rev: v2.2.1 + rev: v2.3.1 hooks: - id: autoflake args: diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cab05a780..e347a6ecf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -70,6 +70,7 @@ Create virtualenv and install dependencies: -r requirements/tests/mysql.txt \ -r requirements/tests/postgres.txt \ -r requirements/tests/oracle.txt \ + -r requirements/tests/pydantic-2.txt \ -r requirements/tests/spark-3.5.0.txt # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 @@ -343,38 +344,83 @@ Release Process Before making a release from the ``develop`` branch, follow these steps: +0. Checkout to ``develop`` branch and update it to the actual state + +.. code:: bash + + git checkout develop + git pull -p + 1. Backup ``NEXT_RELEASE.rst`` .. code:: bash - cp docs/changelog/NEXT_RELEASE.rst docs/changelog/temp_NEXT_RELEASE.rst + cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst" 2. Build the Release notes with Towncrier .. code:: bash - export VERSION=$(cat onetl/VERSION) - towncrier build --version=${VERSION} + VERSION=$(cat onetl/VERSION) + towncrier build "--version=${VERSION}" --yes -3. Update Changelog +3. Change file with changelog to release version number .. code:: bash - mv docs/changelog/NEXT_RELEASE.rst docs/changelog/${VERSION}.rst + mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst" + +4. Remove content above the version number heading in the ``${VERSION}.rst`` file + +.. code:: bash -4. Edit the ``${VERSION}.rst`` file -Remove content above the version number heading in the ``${VERSION}.rst`` file. + awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst" 5. Update Changelog Index .. code:: bash - awk -v version=${VERSION} '/NEXT_RELEASE/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst + awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst -6. Reset ``NEXT_RELEASE.rst`` file +6. Restore ``NEXT_RELEASE.rst`` file from backup .. code:: bash - mv docs/changelog/temp_NEXT_RELEASE.rst docs/changelog/NEXT_RELEASE.rst + mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst" -7. Update the patch version in the ``VERSION`` file of ``develop`` branch **after release**. +7. Commit and push changes to ``develop`` branch + +.. code:: bash + + git add . + git commit -m "Prepare for release ${VERSION}" + git push + +8. Merge ``develop`` branch to ``master``, **WITHOUT** squashing + +.. code:: bash + + git checkout master + git pull + git merge develop + git push + +9. Add git tag to the latest commit in ``master`` branch + +.. 
code:: bash + + git tag "$VERSION" + git push origin "$VERSION" + +10. Update version in ``develop`` branch **after release**: + +.. code:: bash + + git checkout develop + + NEXT_VERSION=$(echo "$VERSION" | awk -F. '/[0-9]+\./{$NF++;print}' OFS=.) + echo "$NEXT_VERSION" > onetl/VERSION + + git add . + git commit -m "Bump version" + git push diff --git a/README.rst b/README.rst index 0372381ae..ffc68e52b 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ onETL ===== |Repo Status| |PyPI| |PyPI License| |PyPI Python Version| -|Documentation| |Build Status| |Coverage| +|Documentation| |Build Status| |Coverage| |pre-commit.ci| .. |Repo Status| image:: https://www.repostatus.org/badges/latest/active.svg :target: https://github.com/MobileTeleSystems/onetl @@ -20,6 +20,8 @@ onETL :target: https://github.com/MobileTeleSystems/onetl/actions .. |Coverage| image:: https://codecov.io/gh/MobileTeleSystems/onetl/branch/develop/graph/badge.svg?token=RIO8URKNZJ :target: https://codecov.io/gh/MobileTeleSystems/onetl +.. |pre-commit.ci| image:: https://results.pre-commit.ci/badge/github/MobileTeleSystems/onetl/develop.svg + :target: https://results.pre-commit.ci/latest/github/MobileTeleSystems/onetl/develop |Logo| diff --git a/docker-compose.yml b/docker-compose.yml index 5b4fea437..6ba2aca64 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -90,6 +90,7 @@ services: - 1433:1433 networks: - onetl + platform: linux/amd64 mysql: image: ${MYSQL_IMAGE:-mysql:latest} @@ -99,6 +100,7 @@ services: - 3306:3306 networks: - onetl + platform: linux/amd64 postgres: image: ${POSTGRES_IMAGE:-postgres:15.2-alpine} @@ -136,6 +138,7 @@ services: interval: 10s timeout: 5s retries: 10 + platform: linux/amd64 ftp: image: ${FTP_IMAGE:-chonjay21/ftps:latest} diff --git a/docker/Dockerfile b/docker/Dockerfile index 0faa2591e..36cbb129f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -68,7 +68,8 @@ RUN pip install \ -r /app/requirements/tests/mysql.txt \ -r /app/requirements/tests/oracle.txt \ -r /app/requirements/tests/postgres.txt \ - -r /app/requirements/tests/kafka.txt + -r /app/requirements/tests/kafka.txt \ + -r /app/requirements/tests/pydantic-${PYDANTIC_VERSION}.txt FROM base AS full diff --git a/docs/changelog/0.10.1.rst b/docs/changelog/0.10.1.rst index 7f30ca284..49aed2425 100644 --- a/docs/changelog/0.10.1.rst +++ b/docs/changelog/0.10.1.rst @@ -11,7 +11,7 @@ Features reader = DBReader( connection=Kafka(...), source="topic_name", - hwm=AutoDetectHWM(name="some_hwm_name", expression="offset"), + hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"), ) with IncrementalStrategy(): diff --git a/docs/changelog/0.10.2.rst b/docs/changelog/0.10.2.rst new file mode 100644 index 000000000..c516735b9 --- /dev/null +++ b/docs/changelog/0.10.2.rst @@ -0,0 +1,41 @@ +0.10.2 (2024-03-21) +=================== + +Features +-------- + +- Add support of Pydantic v2. 
(:github:pull:`230`) + + +Improvements +------------ + +- Improve database connections documentation: + * Add "Types" section describing mapping between Clickhouse and Spark types + * Add "Prerequisites" section describing different aspects of connecting to Clickhouse + * Separate documentation of ``DBReader`` and ``.sql()`` / ``.pipeline(...)`` + * Add examples for ``.fetch()`` and ``.execute()`` (:github:pull:`211`, :github:pull:`228`, :github:pull:`229`, :github:pull:`233`, :github:pull:`234`, :github:pull:`235`, :github:pull:`236`, :github:pull:`240`) +- Add notes to Greenplum documentation about issues with IP resolution and building ``gpfdist`` URL (:github:pull:`228`) +- Allow calling ``MongoDB.pipeline(...)`` with just a collection name, without an explicit aggregation pipeline. (:github:pull:`237`) +- Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. + This allows writing text data to a non-text column (or vice versa), relying on Postgres cast capabilities. + + For example, now it is possible to read a column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, + without using intermediate columns or tables. (:github:pull:`229`) + + +Bug Fixes +--------- + +- Return back handling of ``DBReader(columns="string")``. This was a valid syntax up to the v0.10 release, but it was removed because + most users never used it. It looks like we were wrong, so this behavior is returned, but with a deprecation warning. (:github:pull:`238`) +- Downgrade Greenplum package version from ``2.3.0`` to ``2.2.0``. (:github:pull:`239`) + + This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x. + The connector can open a transaction with a ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks. + + To use this connector with Greenplum 7.x, please pass the package version explicitly: + + .. code-block:: python + + maven_packages = Greenplum.get_packages(package_version="2.3.0", ...) diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst index a598be017..28a2621b2 100644 --- a/docs/changelog/NEXT_RELEASE.rst +++ b/docs/changelog/NEXT_RELEASE.rst @@ -1,6 +1 @@ -.. fill up this file using ``towncrier build`` -.. then delete everything up to the header with version number -.. then rename file to ``{VERSION}.rst`` and add it to index.rst -.. and restore ``NEXT_RELEASE.rst`` content`` as it was before running the command above - .. towncrier release notes start diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 557ac69b4..715b7fdf6 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,7 +3,7 @@ :caption: Changelog DRAFT - NEXT_RELEASE + 0.10.2 0.10.1 0.10.0 0.9.5 diff --git a/docs/conf.py b/docs/conf.py index 958bfe1c5..188b1cdbd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,7 +60,7 @@ "sphinx.ext.extlinks", "sphinx_favicon", ] -numpydoc_show_class_members = True +numpydoc_show_class_members = False autodoc_pydantic_model_show_config = False autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_config_member = False @@ -107,6 +107,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"] +html_extra_path = ["robots.txt"] html_logo = "./_static/logo.svg" favicons = [ {"rel": "icon", "href": "icon.svg", "type": "image/svg+xml"}, diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index f43eb3995..11e01c753 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -3,11 +3,113 @@ Executing statements in Clickhouse ================================== -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +.. warning:: -.. automethod:: Clickhouse.fetch -.. automethod:: Clickhouse.execute -.. automethod:: Clickhouse.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Clickhouse.sql ` instead. + +How to +------ + +There are 2 ways to execute statements in Clickhouse: + +Use ``Clickhouse.fetch`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns a **small number of rows**, like reading +Clickhouse config, or reading data from some reference table. The method returns a Spark DataFrame. + +The method accepts :obj:`JDBCOptions `. + +A connection opened using this method should then be closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`clickhouse-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Clickhouse, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` - call function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + + df = clickhouse.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Clickhouse.JDBCOptions(query_timeout=10), + ) + clickhouse.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Clickhouse.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs the operation in a separate transaction, and then commits it. + +The method accepts :obj:`JDBCOptions `. + +A connection opened using this method should then be closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Clickhouse, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...)
+ + with clickhouse: + # automatically close connection after exiting this context manager + clickhouse.execute("DROP TABLE schema.table") + clickhouse.execute( + """ + CREATE TABLE schema.table AS ( + id UInt8, + key String, + value Float32 + ) + ENGINE = MergeTree() + ORDER BY id + """, + options=Clickhouse.JDBCOptions(query_timeout=10), + ) + +Notes +------ + +These methods **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + +So it should **NOT** be used to read large amounts of data. Use :ref:`DBReader ` or :ref:`Clickhouse.sql ` instead. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/clickhouse/index.rst b/docs/connection/db_connection/clickhouse/index.rst index 1e0d1de65..fcb0ba18d 100644 --- a/docs/connection/db_connection/clickhouse/index.rst +++ b/docs/connection/db_connection/clickhouse/index.rst @@ -7,6 +7,7 @@ Clickhouse :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ Clickhouse :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst new file mode 100644 index 000000000..654add047 --- /dev/null +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -0,0 +1,60 @@ +.. _clickhouse-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Clickhouse server versions: 20.7 or higher +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Clickhouse connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Clickhouse +----------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connector can only use **HTTP** (usually ``8123`` port) or **HTTPS** (usually ``8443`` port) protocol. + +TCP and GRPC protocols are NOT supported. + +Clickhouse cluster interaction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you're using Clickhouse cluster, it is currently possible to connect only to one specific cluster node. +Connecting to multiple nodes simultaneously is not supported. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Clickhouse cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE TABLE ON myschema.* TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index a2c07733c..33bcff4ce 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -1,18 +1,89 @@ .. 
_clickhouse-read: -Reading from Clickhouse -======================= +Reading from Clickhouse using ``DBReader`` +========================================== -There are 2 ways of distributed data reading from Clickhouse: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Clickhouse.sql ` +.. warning:: -Both methods accept :obj:`JDBCReadOptions ` + Please take into account :ref:`clickhouse-types` -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +Supported DBReader features +--------------------------- -.. automethod:: Clickhouse.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Clickhouse) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBReader + + clickhouse = Clickhouse(...) + + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + clickhouse = Clickhouse(...) + + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"), + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Clickhouse to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst new file mode 100644 index 000000000..1a3a1d52a --- /dev/null +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -0,0 +1,68 @@ +.. _clickhouse-sql: + +Reading from Clickhouse using ``Clickhouse.sql`` +================================================ + +``Clickhouse.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Please take into account :ref:`clickhouse-types` + +.. 
warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS String) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Clickhouse to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst new file mode 100644 index 000000000..1df369dd4 --- /dev/null +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -0,0 +1,430 @@ +.. _clickhouse-types: + +Clickhouse <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Clickhouse +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Clickhouse connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Clickhouse type. +* Find corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing Clickhouse table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Clickhouse connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. +* **Find corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` -> ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``Clickhousetype (write)`` match ``Clickhouse type (read)``, no additional casts will be performed, DataFrame column will be written to Clickhouse as is. +* If ``Clickhousetype (write)`` does not match ``Clickhouse type (read)``, DataFrame column will be casted to target column type **on Clickhouse side**. 
For example, you can write a column with text data to an ``Int32`` column, if the column contains valid integer values within the supported value range and precision. + +.. [1] + This allows writing data to tables with ``DEFAULT`` columns - if the DataFrame has no such column, + it will be populated by Clickhouse. + +.. [2] + + Yes, this is weird. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how the Clickhouse connector performs this: + +* Find corresponding ``Spark type`` -> ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating a table in Clickhouse, like ``CREATE TABLE (col1 ...)``, and run it. +* Write the DataFrame to the created table as is. + +But Spark does not have a specific dialect for Clickhouse, so the Generic JDBC dialect is used. +The Generic dialect uses ANSI SQL type names while creating tables in the target database, not database-specific types. + +In some cases this may lead to using a wrong column type. For example, Spark creates a column of type ``TIMESTAMP`` +which corresponds to Clickhouse type ``DateTime32`` (precision up to seconds) +instead of the more precise ``DateTime64`` (precision up to nanoseconds). +This may lead to unintended precision loss, or sometimes data cannot be written to the created table at all. + +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. 
code:: python + + clickhouse.execute( + """ + CREATE TABLE default.target_tbl AS ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `Clickhouse -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> Clickhouse `_ + +Supported types +--------------- + +See `official documentation `_ + +Generic types +~~~~~~~~~~~~~ + +* ``LowCardinality(T)`` is same as ``T`` +* ``Nullable(T)`` is same as ``T``, but Spark column is inferred as ``nullable=True`` + +Numeric types +~~~~~~~~~~~~~ + ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++================================+===================================+===============================+===============================+ +| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=39..76, S=0..76)`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal32(P=0..9)`` | ``DecimalType(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal64(S=0..18)`` | ``DecimalType(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal128(S=0..38)`` | ``DecimalType(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal256(S=0..76)`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | ++--------------------------------+ | | | +| 
``Float64`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``Int16`` | | | | ++--------------------------------+ | | | +| ``Int32`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int256`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ByteType()`` | ``Int8`` | ``Int8`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+ | | | +| ``UInt64`` | | | | ++--------------------------------+ | | | +| ``UInt128`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt256`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +.. [3] + + Clickhouse support numeric types up to 256 bit - ``Int256``, ``UInt256``, ``Decimal256(S)``, ``Decimal(P=39..76, S=0..76)``. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
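If you still need to read such wide numeric columns, one possible workaround is to cast the value to ``String`` on the Clickhouse side and convert it on the Spark side, similar to the explicit type cast approach described below. This is only a rough sketch: the ``wide_value`` column and ``schema.table`` names are illustrative, and ``clickhouse`` is assumed to be an already created connection:

.. code-block:: python

    from pyspark.sql.functions import col
    from pyspark.sql.types import DecimalType

    from onetl.db import DBReader

    reader = DBReader(
        connection=clickhouse,
        source="schema.table",
        columns=[
            "id",
            # cast is performed by Clickhouse, so Spark only sees a String column
            "CAST(wide_value AS String) wide_value",
        ],
    )
    df = reader.run()

    # if the actual values are known to fit into 38 digits,
    # they can be converted back to a numeric type on the Spark side
    df = df.withColumn("wide_value", col("wide_value").cast(DecimalType(38, 0)))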
+ +Temporal types +~~~~~~~~~~~~~~ + +Notes: + * Datetime with timezone has the same precision as without timezone + * ``DateTime`` is alias for ``DateTime32`` + * ``TIMESTAMP`` is alias for ``DateTime32``, but ``TIMESTAMP(N)`` is alias for ``DateTime64(N)`` + ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++===================================+======================================+==================================+===============================+ +| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``Date32`` | unsupported | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| | | | **precision loss** [5]_ | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [6]_ | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | +| | **precision loss** [4]_ | **precision loss** [4]_ | | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+ | +| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | unsupported | | | ++-----------------------------------+ | | | +| ``IntervalMicrosecond`` | | | | ++-----------------------------------+ | | | +| ``IntervalMillisecond`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalMinute`` | | | | ++-----------------------------------+ | | | +| ``IntervalHour`` | | | | ++-----------------------------------+ | | | +| ``IntervalDay`` | | | | ++-----------------------------------+ | | | +| ``IntervalMonth`` | | | | ++-----------------------------------+ | | | +| ``IntervalQuarter`` | | | | ++-----------------------------------+ | | | +| ``IntervalWeek`` | | | | ++-----------------------------------+ | | | +| ``IntervalYear`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ + +.. 
warning:: + + Note that types in Clickhouse and Spark have different value ranges: + + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Clickhouse type | Min value | Max value | Spark type | Min value | Max value | + +========================+===================================+===================================+=====================+================================+================================+ + | ``Date`` | ``1970-01-01`` | ``2149-06-06`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``DateTime32`` | ``1970-01-01 00:00:00`` | ``2106-02-07 06:28:15`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | ``DateTime64(P=0..8)`` | ``1900-01-01 00:00:00.00000000`` | ``2299-12-31 23:59:59.99999999`` | | | | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | ``DateTime64(P=9)`` | ``1900-01-01 00:00:00.000000000`` | ``2262-04-11 23:47:16.999999999`` | | | | + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values in Spark DataFrame can be written to Clickhouse. + + References: + * `Clickhouse Date documentation `_ + * `Clickhouse Datetime32 documentation `_ + * `Clickhouse Datetime64 documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [4] + Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + +.. [5] + Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). + Inserting data with milliseconds precision (``23:59:59.999``) will lead to **throwing away milliseconds**. + +.. [6] + Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. + So you can create Clickhouse table with Spark, but cannot write data to column of this type. 
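Because of these differences in value ranges, it may be worth validating a DataFrame before writing it. A minimal sketch, assuming ``df`` is the DataFrame to be written and ``business_dt`` is an illustrative ``DateType()`` column; the boundaries are taken from the table above:

.. code-block:: python

    from pyspark.sql.functions import col, lit

    # Clickhouse ``Date`` supports only 1970-01-01 .. 2149-06-06,
    # while Spark ``DateType()`` allows a much wider range
    valid = df.filter(
        (col("business_dt") >= lit("1970-01-01")) & (col("business_dt") <= lit("2149-06-06"))
    )
    invalid = df.subtract(valid)

    # rows outside the supported range can be inspected or redirected
    # instead of failing the whole write
    if invalid.count() > 0:
        invalid.show()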
+ +String types +~~~~~~~~~~~~~ + ++--------------------------------------+------------------+------------------------+--------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +======================================+==================+========================+==========================+ +| ``FixedString(N)`` | ``StringType()`` | ``String`` | ``String`` | ++--------------------------------------+ | | | +| ``String`` | | | | ++--------------------------------------+ | | | +| ``Enum8`` | | | | ++--------------------------------------+ | | | +| ``Enum16`` | | | | ++--------------------------------------+ | | | +| ``IPv4`` | | | | ++--------------------------------------+ | | | +| ``IPv6`` | | | | ++--------------------------------------+------------------+ | | +| ``-`` | ``BinaryType()`` | | | ++--------------------------------------+------------------+------------------------+--------------------------+ + +Unsupported types +----------------- + +Columns of these Clickhouse types cannot be read by Spark: + * ``AggregateFunction(func, T)`` + * ``Array(T)`` + * ``JSON`` + * ``Map(K, V)`` + * ``MultiPolygon`` + * ``Nested(field1 T1, ...)`` + * ``Nothing`` + * ``Point`` + * ``Polygon`` + * ``Ring`` + * ``SimpleAggregateFunction(func, T)`` + * ``Tuple(T1, T2, ...)`` + * ``UUID`` + +Dataframe with these Spark types cannot be written to Clickhouse: + * ``ArrayType(T)`` + * ``BinaryType()`` + * ``CharType(N)`` + * ``DayTimeIntervalType(P, S)`` + * ``MapType(K, V)`` + * ``NullType()`` + * ``StructType([...])`` + * ``TimestampNTZType()`` + * ``VarcharType(N)`` + +This is because Spark does not have a dedicated Clickhouse dialect, and uses the Generic JDBC dialect instead. +This dialect does not have type conversion between some types, like Clickhouse ``Array`` -> Spark ``ArrayType()``, and vice versa. + +There is a way to avoid this - just cast everything to ``String``. + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +Use ``CAST`` or ``toJSONString`` to get column data as a string in JSON format, +and then cast the string column in the resulting dataframe to the proper type using `from_json `_: + +.. code:: python + + from pyspark.sql.functions import from_json + from pyspark.sql.types import ArrayType, IntegerType + + reader = DBReader( + connection=clickhouse, + columns=[ + "id", + "toJSONString(array_column) array_column", + ], + ) + df = reader.run() + + # Spark requires all columns to have some specific type, describe it + column_type = ArrayType(IntegerType()) + + df = df.select( + df.id, + from_json(df.array_column, column_type).alias("array_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert the dataframe column to JSON using `to_json `_, +and write it as a ``String`` column in Clickhouse: + +.. code:: python + + clickhouse.execute( + """ + CREATE TABLE default.target_tbl AS ( + id Int32, + array_column_json String + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + from pyspark.sql.functions import to_json + + df = df.select( + df.id, + to_json(df.array_column).alias("array_column_json"), + ) + + writer.run(df) + +Then you can parse this column on the Clickhouse side - for example, by creating a view: + +.. code:: sql + + SELECT + id, + JSONExtract(array_column_json, 'Array(String)') AS array_column + FROM target_tbl + +You can also use `ALIAS `_ +or `MATERIALIZED `_ columns +to avoid writing such an expression in every ``SELECT`` clause all the time: + +.. 
code-block:: sql + + CREATE TABLE default.target_tbl AS ( + id Int32, + array_column_json String, + -- computed column + array_column Array(String) ALIAS JSONExtract(array_column_json, 'Array(String)') + -- or materialized column + -- array_column Array(String) MATERIALIZED JSONExtract(array_column_json, 'Array(String)') + ) + ENGINE = MergeTree() + ORDER BY id + +Downsides: + +* Using ``SELECT JSONExtract(...)`` or an ``ALIAS`` column can be expensive, because the value is calculated on every row access. This can be especially harmful if such a column is used in a ``WHERE`` clause. +* ``ALIAS`` and ``MATERIALIZED`` columns are not included in the ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. + +.. warning:: + + `EPHEMERAL `_ columns are not supported by Spark + because they cannot be selected to determine target column type. diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 97cc95d36..1fe56868b 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -1,9 +1,51 @@ .. _clickhouse-write: -Writing to Clickhouse -===================== +Writing to Clickhouse using ``DBWriter`` +======================================== -For writing data to Clickhouse, use :obj:`DBWriter ` with options below. +For writing data to Clickhouse, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`clickhouse-types` + +.. warning:: + + It is always recommended to create the table explicitly using :ref:`Clickhouse.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBWriter + + clickhouse = Clickhouse(...) + + df = ... # data is here + + writer = DBWriter( + connection=clickhouse, + target="schema.table", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + + writer.run(df) + + +Options +------- + +The method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index e2179a4ec..c0c415369 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -3,6 +3,105 @@ Executing statements in Greenplum ================================== +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` instead. + +How to +------ + +There are 2 ways to execute statements in Greenplum: + +Use ``Greenplum.fetch`` +~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns a **small number of rows**, like reading +Greenplum config, or reading data from some reference table. The method returns a Spark DataFrame. + +The method accepts :obj:`JDBCOptions `. + +A connection opened using this method should then be closed with ``connection.close()`` or ``with connection:``. + +.. 
warning:: + + ``Greenplum.fetch`` is implemented using a Postgres JDBC connection, + so types are handled a bit differently than in ``DBReader``. See :ref:`postgres-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Greenplum, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Greenplum + + greenplum = Greenplum(...) + + df = greenplum.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Greenplum.JDBCOptions(query_timeout=10), + ) + greenplum.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Greenplum.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs the operation in a separate transaction, and then commits it. + +The method accepts :obj:`JDBCOptions `. + +A connection opened using this method should then be closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Greenplum, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Greenplum + + greenplum = Greenplum(...) + + with greenplum: + # automatically close connection after exiting this context manager + greenplum.execute("DROP TABLE schema.table") + greenplum.execute( + """ + CREATE TABLE schema.table ( + id int, + key text, + value real + ) + DISTRIBUTED BY (id) + """, + options=Greenplum.JDBCOptions(query_timeout=10), + ) + Interaction schema ------------------ @@ -44,12 +143,6 @@ The only port used while interacting with Greenplum in this case is ``5432`` (Gr Options ------- -.. currentmodule:: onetl.connection.db_connection.greenplum.connection - -.. automethod:: Greenplum.fetch -.. automethod:: Greenplum.execute -.. automethod:: Greenplum.close - .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options .. autopydantic_model:: JDBCOptions diff --git a/docs/connection/db_connection/greenplum/index.rst b/docs/connection/db_connection/greenplum/index.rst index b4ff40331..af445579f 100644 --- a/docs/connection/db_connection/greenplum/index.rst +++ b/docs/connection/db_connection/greenplum/index.rst @@ -17,3 +17,9 @@ Greenplum read write execute + +.. 
toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 4f33b51a0..04595766c 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -6,11 +6,11 @@ Prerequisites Version Compatibility --------------------- -* Greenplum server versions: 5.x, 6.x, 7.x +* Greenplum server versions: 5.x, 6.x, 7.x (requires ``Greenplum.get_packages(package_version="2.3.0")`` or higher) * Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 -See `official documentation `_. +See `official documentation `_. Installing PySpark ------------------ @@ -24,13 +24,19 @@ Downloading VMware package -------------------------- To use Greenplum connector you should download connector ``.jar`` file from -`VMware website `_ +`VMware website `_ and then pass it to Spark session. .. warning:: Please pay attention to :ref:`Spark & Scala version compatibility `. +.. warning:: + + There are issues with using package of version 2.3.0/2.3.1 with Greenplum 6.x - the connector can + open a transaction with a ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks + during write. + There are several ways to do that. See :ref:`java-packages` for details. .. note:: @@ -53,8 +59,8 @@ Greenplum master only receives commands to start reading/writing process, and ma More details can be found in `official documentation `_. -Number of parallel connections -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Set number of connections +~~~~~~~~~~~~~~~~~~~~~~~~~ .. warning:: @@ -83,32 +89,32 @@ Number of connections can be limited by 2 ways: .. code-tab:: py Spark with master=local - ( + spark = ( SparkSession.builder - # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 - .config("spark.master", "local[10]") + # Spark will start EXACTLY 5 executors with 1 core each, so max number of parallel jobs is 5 + .config("spark.master", "local[5]") .config("spark.executor.cores", 1) - ) + ).getOrCreate() .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation - ( + spark = ( SparkSession.builder .config("spark.master", "yarn") # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 .config("spark.dynamicAllocation.maxExecutors", 10) .config("spark.executor.cores", 1) - ) + ).getOrCreate() .. code-tab:: py Spark with master=yarn or master=k8s, static allocation - ( + spark = ( SparkSession.builder .config("spark.master", "yarn") # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 .config("spark.executor.instances", 10) .config("spark.executor.cores", 1) - ) + ).getOrCreate() * By limiting connection pool size used by Spark (**only** for Spark with ``master=local``): @@ -140,8 +146,8 @@ e.g. by updating ``pg_hba.conf`` file. More details can be found in `official documentation `_. -Network ports -~~~~~~~~~~~~~ +Set connection port +~~~~~~~~~~~~~~~~~~~ Spark with ``master=k8s`` ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -159,7 +165,7 @@ To read data from Greenplum using Spark, following ports should be opened in fir .. code:: python - Greenplum(host="master.host", port=5432, ...) + greenplum = Greenplum(host="master.host", port=5432, ...) * Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executors**.
@@ -167,12 +173,12 @@ To read data from Greenplum using Spark, following ports should be opened in fir .. code:: python - Greenplum( - ..., - extra={ - "server.port": "41000-42000", - }, - ) + greenplum = Greenplum( + ..., + extra={ + "server.port": "41000-42000", + }, + ) Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``. @@ -189,15 +195,147 @@ More details can be found in official documentation: * `format of server.port value `_ * `port troubleshooting `_ -Required grants -~~~~~~~~~~~~~~~ +Set connection host +~~~~~~~~~~~~~~~~~~~ + +Spark with ``master=k8s`` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please follow `the official documentation `_ + +Spark with ``master=local`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, Greenplum connector tries to resolve IP of current host, and then pass it as ``gpfdist`` URL to Greenplum segment. +This may fail in some cases. + +For example, IP can be resolved using ``/etc/hosts`` content like this: + +.. code:: text + + 127.0.0.1 localhost real-host-name + +.. code:: bash + + $ hostname -f + localhost + + $ hostname -i + 127.0.0.1 + +Reading/writing data to Greenplum will fail with following exception: + +.. code-block:: text + + org.postgresql.util.PSQLException: ERROR: connection with gpfdist failed for + "gpfdist://127.0.0.1:49152/local-1709739764667/exec/driver", + effective url: "http://127.0.0.1:49152/local-1709739764667/exec/driver": + error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=123456) + +There are 2 ways to fix that: + +* Explicitly pass your host IP address to connector, like this + + .. code-block:: python + + import os + + # pass here real host IP (accessible from GP segments) + os.environ["HOST_IP"] = "192.168.1.1" + + greenplum = Greenplum( + ..., + extra={ + # connector will read IP from this environment variable + "server.hostEnv": "env.HOST_IP", + }, + spark=spark, + ) + + More details can be found in `official documentation `_. + +* Update ``/etc/hosts`` file to include real host IP: + + .. code:: text + + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 driver-host-name + + So Greenplum connector will properly resolve host IP. + +Spark with ``master=yarn`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The same issue with resolving IP address can occur on Hadoop cluster node, but it's tricky to fix, because each node has a different IP. + +There are 3 ways to fix that: + +* Pass node hostname to ``gpfdist`` URL. So IP will be resolved on segment side: + + .. code-block:: python + + greenplum = Greenplum( + ..., + extra={ + "server.useHostname": "true", + }, + ) + + But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. + + More details can be found in `official documentation `_. + +* Set specific network interface to get IP address from: + + .. code-block:: python + + greenplum = Greenplum( + ..., + extra={ + "server.nic": "eth0", + }, + ) + + You can get list of network interfaces using this command. + + .. note:: + + This command should be executed on Hadoop cluster node, **not** Spark driver host! + + .. 
code-block:: bash + + $ ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 + valid_lft 83457sec preferred_lft 83457sec + + Note that in this case **each** Hadoop cluster node node should have network interface with name ``eth0``. + + More details can be found in `official documentation `_. + +* Update ``/etc/hosts`` on each Hadoop cluster node to include real node IP: + + .. code:: text + + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 cluster-node-name + + So Greenplum connector will properly resolve node IP. + +Set required grants +~~~~~~~~~~~~~~~~~~~ Ask your Greenplum cluster administrator to set following grants for a user, used for creating a connection: .. tabs:: - .. code-tab:: sql Read + write + .. code-tab:: sql Read + Write -- get access to get tables metadata & cluster information GRANT SELECT ON information_schema.tables TO username; @@ -243,28 +381,4 @@ used for creating a connection: -- allow read access to specific table GRANT SELECT ON schema_to_read.table_to_read TO username; - .. code-tab:: sql Write only - - -- get access to get tables metadata & cluster information - GRANT SELECT ON information_schema.tables TO username; - GRANT SELECT ON pg_attribute TO username; - GRANT SELECT ON pg_class TO username; - GRANT SELECT ON pg_namespace TO username; - GRANT SELECT ON pg_settings TO username; - GRANT SELECT ON pg_stats TO username; - GRANT SELECT ON gp_distributed_xacts TO username; - GRANT SELECT ON gp_segment_configuration TO username; - -- Greenplum 5.x only - GRANT SELECT ON gp_distribution_policy TO username; - - -- allow creating external tables in the same schema as target table - GRANT USAGE ON SCHEMA schema_to_write TO username; - GRANT CREATE ON SCHEMA schema_to_write TO username; - -- yes, ``readable``, because data is read from Spark executor to Greenplum. - ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist'); - - -- allow read access to specific table (to get column types) - -- allow write access to specific table - GRANT SELECT, INSERT ON schema_to_write.table_to_write TO username; - More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index feecaf0bb..310521677 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -1,15 +1,109 @@ .. _greenplum-read: -Reading from Greenplum -======================= +Reading from Greenplum using ``DBReader`` +========================================== -For reading data from Greenplum, use :obj:`DBReader ` with options below. +Data can be read from Greenplum to Spark using :obj:`DBReader `. +It also supports :ref:`strategy` for incremental data reading. + +.. warning:: + + Please take into account :ref:`greenplum-types`. .. note:: Unlike JDBC connectors, *Greenplum connector for Spark* does not support - executing **custom** SQL queries using ``.sql`` method. Connector can be used to only read some table data - (with filters, if needed) using DBReader. + executing **custom** SQL queries using ``.sql`` method. Connector can be used to only read data from a table or view. 
+
+Supported DBReader features
+---------------------------
+
+* ✅︎ ``columns`` (see note below)
+* ✅︎ ``where`` (see note below)
+* ✅︎ ``hwm`` (see note below), supported strategies:
+* * ✅︎ :ref:`snapshot-strategy`
+* * ✅︎ :ref:`incremental-strategy`
+* * ✅︎ :ref:`snapshot-batch-strategy`
+* * ✅︎ :ref:`incremental-batch-strategy`
+* ❌ ``hint`` (not supported by Greenplum)
+* ❌ ``df_schema``
+* ✅︎ ``options`` (see :obj:`GreenplumReadOptions `)
+
+.. warning::
+
+    In case of the Greenplum connector, ``DBReader`` does not generate a raw ``SELECT`` query. Instead it relies on Spark SQL syntax,
+    which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL.
+
+    So ``columns``, ``where`` and ``hwm.expression`` should be specified in Spark SQL syntax, not Greenplum SQL.
+
+    This is OK:
+
+    .. code-block:: python
+
+        DBReader(
+            columns=[
+                "some_column",
+                # this cast is executed on Spark side
+                "CAST(another_column AS STRING)",
+            ],
+            # this predicate is parsed by Spark, and can be pushed down to Greenplum
+            where="some_column LIKE 'val1%'",
+        )
+
+    This will fail:
+
+    .. code-block:: python
+
+        DBReader(
+            columns=[
+                "some_column",
+                # Spark does not have `text` type
+                "CAST(another_column AS text)",
+            ],
+            # Spark does not support ~ syntax for regexp matching
+            where="some_column ~ 'val1.*'",
+        )
+
+Examples
+--------
+
+Snapshot strategy:
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS string) value", "updated_dt"],
+        where="key = 'something'",
+    )
+    df = reader.run()
+
+Incremental strategy:
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+    from onetl.strategy import IncrementalStrategy
+
+    greenplum = Greenplum(...)
+
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS string) value", "updated_dt"],
+        where="key = 'something'",
+        hwm=DBReader.AutoDetectHWM(name="greenplum_hwm", expression="updated_dt"),
+    )
+
+    with IncrementalStrategy():
+        df = reader.run()

 Interaction schema
 ------------------
@@ -91,57 +185,188 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d
 Recommendations
 ---------------

-Reading from views
-~~~~~~~~~~~~~~~~~~
+Select only required columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Instead of passing ``"*"`` in ``DBReader(columns=[...])``, prefer passing exact column names. This reduces the amount of data passed from Greenplum to Spark.

-This connector is **NOT** designed to read data from views.
+Pay attention to ``where`` value
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-You can technically read data from a view which has
-`gp_segment_id `_ column.
-But this is **not** recommended because each Spark executor will run the same query, which may lead to running duplicated calculations
-and sending data between segments only to skip most of the result and select only small part.
+Instead of filtering data on the Spark side using ``df.filter(df.column == 'value')``, pass a proper ``DBReader(where="column = 'value'")`` clause.
+This both reduces the amount of data sent from Greenplum to Spark, and may also improve performance of the query,
+especially if there are indexes or partitions for columns used in the ``where`` clause.
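+
+For illustration only, a minimal sketch of both approaches (``schema.table`` and ``key`` are hypothetical names):
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    # less efficient: the filter is expressed only on the Spark side
+    df = DBReader(connection=greenplum, source="schema.table").run()
+    df = df.filter(df.key == "something")
+
+    # preferred: the predicate is passed to the connector and can be pushed down to Greenplum
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.table",
+        where="key = 'something'",
+    )
+    df = reader.run()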
-Prefer following option:
-    * Create staging table to store result data, using :obj:`Greenplum.execute `
-    * Use the same ``.execute`` method run a query ``INSERT INTO staging_table AS SELECT FROM some_view``. This will be done on Greenplum segments side, query will be run only once.
-    * Read data from staging table to Spark executor using :obj:`DBReader `.
-    * Drop staging table using ``.execute`` method.
+Read data in parallel
+~~~~~~~~~~~~~~~~~~~~~

-Using ``JOIN`` on Greenplum side
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+In case of the Greenplum connector, ``DBReader`` requires the view or table to have a column which is used by Spark
+for parallel reads.
+
+Choosing a proper column allows each Spark executor to read only the part of the data stored in a specific segment,
+avoiding moving large amounts of data between segments, which improves reading performance.
+
+Using ``gp_segment_id``
+^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, ``DBReader`` will use the `gp_segment_id `_
+column for parallel data reading. Each DataFrame partition will contain data of a specific Greenplum segment.
+
+This allows each Spark executor to read only data from a specific Greenplum segment, avoiding moving large amounts of data between segments.

-If you need to get data of joining 2 tables in Greenplum, you should:
-    * Create staging table to store result data, using ``Greenplum.execute``
-    * Use the same ``Greenplum.execute`` run a query ``INSERT INTO staging_table AS SELECT FROM table1 JOIN table2``. This will be done on Greenplum segments side, in a distributed way.
-    * Read data from staging table to Spark executor using ``DBReader``.
-    * Drop staging table using ``Greenplum.execute``.
+If a view is used, it is recommended to include the ``gp_segment_id`` column in this view:
+
+.. dropdown:: Reading from view with gp_segment_id column
+
+    .. code-block:: python
+
+        from onetl.connection import Greenplum
+        from onetl.db import DBReader
+
+        greenplum = Greenplum(...)
+
+        greenplum.execute(
+            """
+            CREATE VIEW schema.view_with_gp_segment_id AS
+            SELECT
+                id,
+                some_column,
+                another_column,
+                gp_segment_id -- IMPORTANT
+            FROM schema.some_table
+            """,
+        )
+
+        reader = DBReader(
+            connection=greenplum,
+            source="schema.view_with_gp_segment_id",
+        )
+        df = reader.run()
+
+Using custom ``partition_column``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sometimes a table or view lacks the ``gp_segment_id`` column, but there is some column
+whose value range is correlated with the Greenplum segment distribution.
+
+In this case, a custom column can be used instead:
+
+.. dropdown:: Reading from view with custom partition_column
+
+    .. code-block:: python
+
+        from onetl.connection import Greenplum
+        from onetl.db import DBReader
+
+        greenplum = Greenplum(...)
+
+        greenplum.execute(
+            """
+            CREATE VIEW schema.view_with_partition_column AS
+            SELECT
+                id,
+                some_column,
+                part_column -- correlated to greenplum segment ID
+            FROM schema.some_table
+            """,
+        )
+
+        reader = DBReader(
+            connection=greenplum,
+            source="schema.view_with_partition_column",
+            options=Greenplum.Options(
+                # parallelize data using specified column
+                partition_column="part_column",
+                # create 10 Spark tasks, each will read only part of table data
+                num_partitions=10,
+            ),
+        )
+        df = reader.run()
+
+Reading ``DISTRIBUTED REPLICATED`` tables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Replicated tables do not have a ``gp_segment_id`` column at all, so you need to set ``partition_column`` to some column name
+of type integer/bigint/smallint.
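+
+Mirroring the ``partition_column`` example above, a minimal sketch for a replicated table (table and column names are hypothetical; ``id`` is assumed to be an integer column):
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.replicated_table",
+        options=Greenplum.Options(
+            # integer column used to split data into partitions
+            partition_column="id",
+            # number of Spark tasks reading the data
+            num_partitions=10,
+        ),
+    )
+    df = reader.run()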
+
+Parallel ``JOIN`` execution
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using views which require data motion between Greenplum segments, like ``JOIN`` queries, another approach should be used.
+
+Each of the N Spark executors will run the same query, so each of the N queries will start its own ``JOIN`` process, leading to a really heavy load on Greenplum segments.
+**This should be avoided**.
+
+Instead it is recommended to run the ``JOIN`` query on the Greenplum side, save the result to an intermediate table,
+and then read this table using ``DBReader``:
+
+.. dropdown:: Reading from view using intermediate table
+
+    .. code-block:: python
+
+        from onetl.connection import Greenplum
+        from onetl.db import DBReader
+
+        greenplum = Greenplum(...)
+
+        greenplum.execute(
+            """
+            CREATE UNLOGGED TABLE schema.intermediate_table AS
+            SELECT
+                id,
+                tbl1.col1,
+                tbl1.data,
+                tbl2.another_data
+            FROM
+                schema.table1 as tbl1
+            JOIN
+                schema.table2 as tbl2
+            ON
+                tbl1.col1 = tbl2.col2
+            WHERE ...
+            """,
+        )
+
+        reader = DBReader(
+            connection=greenplum,
+            source="schema.intermediate_table",
+        )
+        df = reader.run()
+
+        # write dataframe somewhere
+
+        greenplum.execute(
+            """
+            DROP TABLE schema.intermediate_table
+            """,
+        )

 .. warning::

-    Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes!
+    **NEVER** do this:

-    This will lead to sending all the data from both tables to Spark executor memory, and then ``JOIN``
-    will be performed on Spark side, not Greenplum. This is **very** inefficient.
+    .. code-block:: python

-Using ``TEMPORARY`` tables
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+        df1 = DBReader(connection=greenplum, table="public.table1", ...).run()
+        df2 = DBReader(connection=greenplum, table="public.table2", ...).run()

-Someone could think that writing data from ``VIEW`` or result of ``JOIN`` to ``TEMPORARY`` table,
-and then passing it to DBReader, is an efficient way to read data from Greenplum, because temp tables are not generating WAL files,
-and are automatically deleted after finishing the transaction.
+        joined_df = df1.join(df2, on="col")

-That's will **not** work. Each Spark executor establishes its own connection to Greenplum,
-and thus reads its own temporary table, which does not contain any data.
+    This will lead to sending all the data from both ``table1`` and ``table2`` to Spark executor memory, and then the ``JOIN``
+    will be performed on the Spark side, not inside Greenplum. This is **VERY** inefficient.

-You should use `UNLOGGED `_ tables
-to write data to staging table without generating useless WAL logs.
+``TEMPORARY`` tables notice
+^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Mapping of Greenplum types to Spark types
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+One could think that writing data from a view or a ``JOIN`` result to a ``TEMPORARY`` table,
+and then passing it to ``DBReader``, is an efficient way to read data from Greenplum, because temp tables do not generate WAL files
+and are automatically deleted after finishing the transaction.
+
+That will **NOT** work. Each Spark executor establishes its own connection to Greenplum.
+Each connection starts its own transaction, which means that every executor will read an empty temporary table.

-See `official documentation `_
-for more details.
-onETL does not perform any additional casting of types while reading data.
+You should use `UNLOGGED `_ tables
+to write data to an intermediate table without generating WAL logs.
Options ------- diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst new file mode 100644 index 000000000..baea34914 --- /dev/null +++ b/docs/connection/db_connection/greenplum/types.rst @@ -0,0 +1,412 @@ +.. _greenplum-types: + +Greenplum <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Greenplum +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Greenplum connector performs this: + +* Execute query ``SELECT * FROM table LIMIT 0`` [1]_. +* For each column in query result get column name and Greenplum type. +* Find corresponding ``Greenplum type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Use Spark column projection and predicate pushdown features to build a final query. +* Create DataFrame from generated query with inferred schema. + +.. [1] + Yes, **all columns of a table**, not just selected ones. + This means that if source table **contains** columns with unsupported type, the entire table cannot be read. + +Writing to some existing Greenplum table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Greenplum connector performs this: + +* Get names of columns in DataFrame. +* Perform ``SELECT * FROM table LIMIT 0`` query. +* For each column in query result get column name and Greenplum type. +* Match table columns with DataFrame columns (by name, case insensitive). + If some column is present only in target table, but not in DataFrame (like ``DEFAULT`` or ``SERIAL`` column), and vice versa, raise an exception. + See `Explicit type cast`_. +* Find corresponding ``Spark type`` -> ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``Greenplumtype (write)`` match ``Greenplum type (read)``, no additional casts will be performed, DataFrame column will be written to Greenplum as is. +* If ``Greenplumtype (write)`` does not match ``Greenplum type (read)``, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to ``json`` column which Greenplum connector currently does not support. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how Greenplum connector performs this: + +* Find corresponding ``Spark type`` -> ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in Greenplum, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +More details `can be found here `_. + +But Greenplum connector support only limited number of types and almost no custom clauses (like ``PARTITION BY``). +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=greenplum, + table="public.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + writer.run(df) + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. 
code:: python + + greenplum.execute( + """ + CREATE TABLE public.table AS ( + id int32, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (business_dt) + DISTRIBUTED BY id + """, + ) + + writer = DBWriter( + connection=greenplum, + table="public.table", + options=Greenplum.WriteOptions(if_exists="append"), + ) + writer.run(df) + +See Greenplum `CREATE TABLE `_ documentation. + +Supported types +--------------- + +See: + * `official connector documentation `_ + * `list of Greenplum types `_ + +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++==================================+===================================+===============================+=========================+ +| ``decimal`` | ``DecimalType(P=38, S=18)`` | ``decimal(P=38, S=18)`` | ``decimal`` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=39.., S=0..)`` | unsupported [2]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``double precision`` | ``DoubleType()`` | ``double precision`` | ``double precision`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``-`` | ``ByteType()`` | unsupported | unsupported | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``integer`` | ``IntegerType()`` | ``integer`` | ``integer`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``money`` | unsupported | | | ++----------------------------------+ | | | +| ``int4range`` | | | | ++----------------------------------+ | | | +| ``int8range`` | | | | ++----------------------------------+ | | | +| ``numrange`` | | | | ++----------------------------------+ | | | +| ``int2vector`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ + +.. [2] + + Greenplum support decimal types with unlimited precision. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). 
It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +Temporal types +~~~~~~~~~~~~~~ + ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++====================================+=========================+=======================+=========================+ +| ``date`` | ``DateType()`` | ``date`` | ``date`` | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| ``time`` | ``TimestampType()``, | ``timestamp`` | ``timestamp`` | ++------------------------------------+ time format quirks [3]_ | | | +| ``time(0..6)`` | | | | ++------------------------------------+ | | | +| ``time with time zone`` | | | | ++------------------------------------+ | | | +| ``time(0..6) with time zone`` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| ``timestamp`` | ``TimestampType()`` | ``timestamp`` | ``timestamp`` | ++------------------------------------+ | | | +| ``timestamp(0..6)`` | | | | ++------------------------------------+ | | | +| ``timestamp with time zone`` | | | | ++------------------------------------+ | | | +| ``timestamp(0..6) with time zone`` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ +| ``interval`` or any precision | unsupported | | | ++------------------------------------+ | | | +| ``daterange`` | | | | ++------------------------------------+ | | | +| ``tsrange`` | | | | ++------------------------------------+ | | | +| ``tstzrange`` | | | | ++------------------------------------+-------------------------+-----------------------+-------------------------+ + +.. warning:: + + Note that types in Greenplum and Spark have different value ranges: + + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Greenplum type | Min value | Max value | Spark type | Min value | Max value | + +================+=================================+==================================+=====================+================================+================================+ + | ``date`` | ``-4713-01-01`` | ``5874897-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4713-01-01 00:00:00.000000`` | ``294276-12-31 23:59:59.999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +----------------+---------------------------------+----------------------------------+ | | | + | ``time`` | ``00:00:00.000000`` | ``24:00:00.000000`` | | | | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Greenplum to Spark. + + References: + * `Greenplum types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [3] + + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. 
So instead of reading data from Postgres like ``23:59:59`` + it is actually read ``1970-01-01 23:59:59``, and vice versa. + +String types +~~~~~~~~~~~~ + ++-----------------------------+------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++=============================+==================+=======================+=========================+ +| ``character`` | ``StringType()`` | ``text`` | ``text`` | ++-----------------------------+ | | | +| ``character(N)`` | | | | ++-----------------------------+ | | | +| ``character varying`` | | | | ++-----------------------------+ | | | +| ``character varying(N)`` | | | | ++-----------------------------+ | | | +| ``text`` | | | | ++-----------------------------+ | | | +| ``xml`` | | | | ++-----------------------------+ | | | +| ``CREATE TYPE ... AS ENUM`` | | | | ++-----------------------------+------------------+-----------------------+-------------------------+ +| ``json`` | unsupported | | | ++-----------------------------+ | | | +| ``jsonb`` | | | | ++-----------------------------+------------------+-----------------------+-------------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------------+-------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++==========================+===================+=======================+=========================+ +| ``boolean`` | ``BooleanType()`` | ``boolean`` | ``boolean`` | ++--------------------------+-------------------+-----------------------+-------------------------+ +| ``bit`` | unsupported | | | ++--------------------------+ | | | +| ``bit(N)`` | | | | ++--------------------------+ | | | +| ``bit varying`` | | | | ++--------------------------+ | | | +| ``bit varying(N)`` | | | | ++--------------------------+-------------------+-----------------------+-------------------------+ +| ``bytea`` | unsupported [4]_ | | | ++--------------------------+-------------------+-----------------------+-------------------------+ +| ``-`` | ``BinaryType()`` | ``bytea`` | ``bytea`` | ++--------------------------+-------------------+-----------------------+-------------------------+ + +.. [4] Yes, that's weird. 
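+
+One possible workaround (not a connector feature, just an assumption following the same approach as `Explicit type cast`_ below) is to expose ``bytea`` data as ``text`` in a view, and decode it back on the Spark side. A minimal sketch, assuming a hypothetical table ``schema.table_with_bytea`` with a ``payload bytea`` column:
+
+.. code-block:: python
+
+    from pyspark.sql.functions import unhex
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    # expose bytea as hex-encoded text, keeping gp_segment_id for parallel reads
+    greenplum.execute(
+        """
+        CREATE VIEW schema.view_with_bytea_as_text AS
+        SELECT
+            id,
+            encode(payload, 'hex') AS payload_hex,
+            gp_segment_id
+        FROM schema.table_with_bytea
+        """,
+    )
+
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.view_with_bytea_as_text",
+    )
+    df = reader.run()
+
+    # decode hex text back to binary on the Spark side
+    df = df.select(df.id, unhex(df.payload_hex).alias("payload"))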
+ +Struct types +~~~~~~~~~~~~ + ++--------------------------------+------------------+-----------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++================================+==================+=======================+=========================+ +| ``T[]`` | unsupported | | | ++--------------------------------+------------------+-----------------------+-------------------------+ +| ``-`` | ``ArrayType()`` | unsupported | | ++--------------------------------+------------------+-----------------------+-------------------------+ +| ``CREATE TYPE sometype (...)`` | ``StringType()`` | ``text`` | ``text`` | ++--------------------------------+------------------+-----------------------+-------------------------+ +| ``-`` | ``StructType()`` | unsupported | | ++--------------------------------+------------------+ | | +| ``-`` | ``MapType()`` | | | ++--------------------------------+------------------+-----------------------+-------------------------+ + +Unsupported types +----------------- + +Columns of these types cannot be read/written by Spark: + * ``cidr`` + * ``inet`` + * ``macaddr`` + * ``macaddr8`` + * ``circle`` + * ``box`` + * ``line`` + * ``lseg`` + * ``path`` + * ``point`` + * ``polygon`` + * ``tsvector`` + * ``tsquery`` + * ``uuid`` + +The is a way to avoid this - just cast unsupported types to ``text``. But the way this can be done is not a straightforward. + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +Unfortunately, it is not possible to cast unsupported column to some supported type on ``DBReader`` side: + +.. code-block:: python + + DBReader( + connection=greenplum, + # will fail + columns=["CAST(column AS text)"], + ) + +This is related to Greenplum connector implementation. Instead of passing this ``CAST`` expression to ``SELECT`` query +as is, it performs type cast on Spark side, so this syntax is not supported. + +But there is a workaround - create a view with casting unsupported column to ``text`` (or any other supported type). + +For example, you can use ``to_json`` Postgres function for convert column of any type to string representation. +You can then parse this column on Spark side using `from_json `_: + +.. code:: python + + from pyspark.sql.functions import from_json + from pyspark.sql.types import ArrayType, IntegerType + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + # create view with proper type cast + greenplum.execute( + """ + CREATE VIEW schema.view_with_json_column AS + SELECT + id, + supported_column, + to_json(array_column) array_column_as_json, + gp_segment_id -- ! important ! + FROM + schema.table_with_unsupported_columns + """, + ) + + # create dataframe using this view + reader = DBReader( + connection=greenplum, + source="schema.view_with_json_column", + ) + df = reader.run() + + # Spark requires all columns to have some type, describe it + column_type = ArrayType(IntegerType()) + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + from_json(df.array_column_as_json, schema).alias("array_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Greenplum table. + +For example, you can convert data using `to_json `_ function. + +.. 
code:: python + + from pyspark.sql.functions import to_json + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE TABLE schema.target_table ( + id int, + supported_column timestamp, + array_column_as_json jsonb, -- or text + ) + DISTRIBUTED BY id + """, + ) + + write_df = df.select( + df.id, + df.supported_column, + to_json(df.array_column).alias("array_column_json"), + ) + + writer = DBWriter( + connection=greenplum, + target="schema.target_table", + ) + writer.run(write_df) + +Then you can parse this column on Greenplum side: + +.. code-block:: sql + + SELECT + id, + supported_column, + -- access first item of an array + array_column_as_json->0 + FROM + schema.target_table diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index 8dfa5a4c6..d7e993d41 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -1,10 +1,46 @@ .. _greenplum-write: -Writing to Greenplum -===================== +Writing to Greenplum using ``DBWriter`` +======================================== -For writing data to Greenplum, use :obj:`DBWriter ` with options below. +For writing data to Greenplum, use :obj:`DBWriter ` +with :obj:`GreenplumWriteOptions `. +.. warning:: + + Please take into account :ref:`greenplum-types`. + +.. warning:: + + It is always recommended to create table explicitly using :ref:`Greenplum.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different types than it is expected. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBWriter + + greenplum = Greenplum(...) + + df = ... # data is here + + writer = DBWriter( + connection=greenplum, + target="schema.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + + writer.run(df) Interaction schema ------------------ @@ -87,16 +123,6 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d deactivate "Spark driver" @enduml -Recommendations ---------------- - -Mapping of Spark types to Greenplum types -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See `official documentation `_ -for more details. -onETL does not perform any additional casting of types while writing data. - Options ------- diff --git a/docs/connection/db_connection/mongodb/index.rst b/docs/connection/db_connection/mongodb/index.rst index a863265a4..6eacc8548 100644 --- a/docs/connection/db_connection/mongodb/index.rst +++ b/docs/connection/db_connection/mongodb/index.rst @@ -7,6 +7,7 @@ MongoDB :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,4 +15,11 @@ MongoDB :caption: Operations read + pipeline write + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mongodb/pipeline.rst b/docs/connection/db_connection/mongodb/pipeline.rst new file mode 100644 index 000000000..f9d60e0f6 --- /dev/null +++ b/docs/connection/db_connection/mongodb/pipeline.rst @@ -0,0 +1,35 @@ +.. _mongodb-sql: + +Reading from MongoDB using ``MongoDB.pipeline`` +=============================================== + +:obj:`MongoDB.sql ` allows passing custom pipeline, +but does not support incremental strategies. + +.. 
warning:: + + Please take into account :ref:`mongodb-types` + +Recommendations +--------------- + +Pay attention to ``pipeline`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``mongodb.pipeline(..., pipeline={"$match": {"column": {"$eq": "value"}}})`` value. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``pipeline`` value. + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.mongodb.connection + +.. automethod:: MongoDB.pipeline + +.. currentmodule:: onetl.connection.db_connection.mongodb.options + +.. autopydantic_model:: MongoDBPipelineOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst new file mode 100644 index 000000000..6bc6e90e9 --- /dev/null +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -0,0 +1,77 @@ +.. _mongodb-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* MongoDB server versions: 4.0 or higher +* Spark versions: 3.2.x - 3.4.x +* Scala versions: 2.12 - 2.13 +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use MongoDB connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MongoDB +--------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MongoDB host by using either DNS name of host or it's IP address. + +It is also possible to connect to MongoDB shared cluster: + +.. code-block:: python + + + mongo = MongoDB( + host="master.host.or.ip", + user="user", + password="*****", + database="target_database", + spark=spark, + extra={ + # read data from secondary cluster node, switch to primary if not available + "readPreference": "secondaryPreferred", + }, + ) + +Supported ``readPreference`` values are described in `official documentation `_. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port ``27017``. Port may differ for different MongoDB instances. +Please ask your MongoDB administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MongoDB cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: js Read + Write + + // allow writing data to specific database + db.grantRolesToUser("username", [{db: "somedb", role: "readWrite"}]) + + .. code-tab:: js Read only + + // allow reading data from specific database + db.grantRolesToUser("username", [{db: "somedb", role: "read"}]) + +See: + * `db.grantRolesToUser documentation `_ + * `MongoDB builtin roles `_ diff --git a/docs/connection/db_connection/mongodb/read.rst b/docs/connection/db_connection/mongodb/read.rst index b3d51686a..e90bc6e2b 100644 --- a/docs/connection/db_connection/mongodb/read.rst +++ b/docs/connection/db_connection/mongodb/read.rst @@ -1,16 +1,137 @@ .. 
_mongodb-read: -Reading from MongoDB -==================== +Reading from MongoDB using ``DBReader`` +======================================= -There are 2 ways of distributed data reading from MongoDB: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom pipelines, e.g. aggregation. -* Using :obj:`DBReader ` with different :ref:`strategy` and :obj:`MongoDBReadOptions ` -* Using :obj:`MongoDB.pipeline ` with :obj:`MongoDBPipelineOptions ` +.. warning:: -.. currentmodule:: onetl.connection.db_connection.mongodb.connection + Please take into account :ref:`mongodb-types` -.. automethod:: MongoDB.pipeline +Supported DBReader features +--------------------------- + +* ❌ ``columns`` (for now, all document fields are read) +* ✅︎ ``where`` (passed to ``{"$match": ...}`` aggregation pipeline) +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* * Note that ``expression`` field of HWM can only be a field name, not a custom expression +* ✅︎ ``hint`` (see `official documentation `_) +* ✅︎ ``df_schema`` (mandatory) +* ✅︎ ``options`` (see :obj:`MongoDBReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + options=MongoDBReadOptions(batchSize=10000), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"), + options=MongoDBReadOptions(batchSize=10000), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where={"column": {"$eq": "value"}})`` clause. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``where`` clause. + +Read options +------------ .. 
currentmodule:: onetl.connection.db_connection.mongodb.options @@ -18,8 +139,3 @@ There are 2 ways of distributed data reading from MongoDB: :member-order: bysource :model-show-field-summary: false :field-show-constraints: false - -.. autopydantic_model:: MongoDBPipelineOptions - :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst new file mode 100644 index 000000000..bd5978aa9 --- /dev/null +++ b/docs/connection/db_connection/mongodb/types.rst @@ -0,0 +1,268 @@ +.. _mongodb-types: + +MongoDB <-> Spark type mapping +============================== + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of fields with corresponding Spark types. All operations on a field are performed using field type. + +MongoDB is, by design, __schemaless__. So there are 2 ways how this can be handled: + +* User provides DataFrame schema explicitly: + + .. dropdown:: See example + + .. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] + ), + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) + +* Rely on MongoDB connector schema infer: + + .. code-block:: python + + df = mongodb.pipeline(collection="some_collection") + + In this case MongoDB connector read a sample of collection documents, and build DataFrame schema based on document fields and values. + +It is highly recommended to pass ``df_schema`` explicitly, to avoid type conversion issues. 
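+
+If you do rely on schema inference, it may help to inspect the inferred schema before building anything on top of it (a minimal sketch; the collection name is hypothetical):
+
+.. code-block:: python
+
+    df = mongodb.pipeline(collection="some_collection")
+
+    # check which Spark types were inferred from the sampled documents;
+    # if they differ from what you expect, pass df_schema explicitly instead
+    df.printSchema()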
+ +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++---------------------+-----------------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=============================+======================+ +| ``Decimal128`` | ``DecimalType(P=34, S=32)`` | ``Decimal128`` | ++---------------------+-----------------------------+----------------------+ +| ``-`` | ``FloatType()`` | ``Double`` | ++---------------------+-----------------------------+ | +| ``Double`` | ``DoubleType()`` | | ++---------------------+-----------------------------+----------------------+ +| ``-`` | ``ByteType()`` | ``Int32`` | ++---------------------+-----------------------------+ | +| ``-`` | ``ShortType()`` | | ++---------------------+-----------------------------+ | +| ``Int32`` | ``IntegerType()`` | | ++---------------------+-----------------------------+----------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ++---------------------+-----------------------------+----------------------+ + +Temporal types +~~~~~~~~~~~~~~ + ++------------------------+-----------------------------------+-------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++========================+===================================+=========================+ +| ``-`` | ``DateType()``, days | ``Date``, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| ``Date``, milliseconds | ``TimestampType()``, microseconds | ``Date``, milliseconds, | +| | | **precision loss** [2]_ | ++------------------------+-----------------------------------+-------------------------+ +| ``Timestamp``, seconds | ``TimestampType()``, microseconds | ``Date``, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| ``-`` | ``TimestampNTZType()`` | unsupported | ++------------------------+-----------------------------------+ | +| ``-`` | ``DayTimeIntervalType()`` | | ++------------------------+-----------------------------------+-------------------------+ + +.. warning:: + + Note that types in MongoDB and Spark have different value ranges: + + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MongoDB type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | ``Date`` | -290 million years | 290 million years | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``Timestamp`` | ``1970-01-01 00:00:00`` | ``2106-02-07 09:28:16`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all values can be read from MongoDB to Spark, and can written from Spark DataFrame to MongoDB. 
+ + References: + * `MongoDB Date type documentation `_ + * `MongoDB Timestamp documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [2] + MongoDB ``Date`` type has precision up to milliseconds (``23:59:59.999``). + Inserting data with microsecond precision (``23:59:59.999999``) + will lead to **throwing away microseconds**. + +String types +~~~~~~~~~~~~~ + +Note: fields of deprecated MongoDB type ``Symbol`` are excluded during read. + ++---------------------+------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+==================+======================+ +| ``String`` | ``StringType()`` | ``String`` | ++---------------------+ | | +| ``Code`` | | | ++---------------------+ | | +| ``RegExp`` | | | ++---------------------+------------------+----------------------+ + +Binary types +~~~~~~~~~~~~ + ++---------------------+-------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+===================+======================+ +| ``Boolean`` | ``BooleanType()`` | ``Boolean`` | ++---------------------+-------------------+----------------------+ +| ``Binary`` | ``BinaryType()`` | ``Binary`` | ++---------------------+-------------------+----------------------+ + +Struct types +~~~~~~~~~~~~ + ++---------------------+-----------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=======================+======================+ +| ``Array[T]`` | ``ArrayType(T)`` | ``Array[T]`` | ++---------------------+-----------------------+----------------------+ +| ``Object[...]`` | ``StructType([...])`` | ``Object[...]`` | ++---------------------+-----------------------+ | +| ``-`` | ``MapType(...)`` | | ++---------------------+-----------------------+----------------------+ + +Special types +~~~~~~~~~~~~~ + ++---------------------+---------------------------------------------------------+---------------------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=========================================================+=======================================+ +| ``ObjectId`` | ``StringType()`` | ``String`` | ++---------------------+ | | +| ``MaxKey`` | | | ++---------------------+ | | +| ``MinKey`` | | | ++---------------------+---------------------------------------------------------+---------------------------------------+ +| ``Null`` | ``NullType()`` | ``Null`` | ++---------------------+ | | +| ``Undefined`` | | | ++---------------------+---------------------------------------------------------+---------------------------------------+ +| ``DBRef`` | ``StructType([$ref: StringType(), $id: StringType()])`` | ``Object[$ref: String, $id: String]`` | ++---------------------+---------------------------------------------------------+---------------------------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +Currently it is not possible to cast field types using ``DBReader``. But this can be done using ``MongoDB.pipeline``. + +``MongoDB.pipeline`` +~~~~~~~~~~~~~~~~~~~~ + +You can use ``$project`` aggregation to cast field types: + +.. code-block:: python + + from pyspark.sql.types import IntegerType, StructField, StructType + + from onetl.connection import MongoDB + from onetl.db import DBReader + + mongodb = MongoDB(...) 
+ + df = mongodb.pipeline( + collection="my_collection", + pipeline=[ + { + "$project": { + # convert unsupported_field to string + "unsupported_field_str": { + "$convert": { + "input": "$unsupported_field", + "to": "string", + }, + }, + # skip unsupported_field from result + "unsupported_field": 0, + } + } + ], + ) + + # cast field content to proper Spark type + df = df.select( + df.id, + df.supported_field, + # explicit cast + df.unsupported_field_str.cast("integer").alias("parsed_integer"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert dataframe field to string on Spark side, and then write it to MongoDB: + +.. code:: python + + + df = df.select( + df.id, + df.unsupported_field.cast("string").alias("array_field_json"), + ) + + writer.run(df) diff --git a/docs/connection/db_connection/mongodb/write.rst b/docs/connection/db_connection/mongodb/write.rst index 5fff32d70..3ae86fece 100644 --- a/docs/connection/db_connection/mongodb/write.rst +++ b/docs/connection/db_connection/mongodb/write.rst @@ -1,9 +1,41 @@ .. _mongodb-write: -Writing to MongoDB -================== +Writing to MongoDB using ``DBWriter`` +===================================== -For writing data to MongoDB, use :obj:`DBWriter ` with options below. +For writing data to MongoDB, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`mongodb-types` + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBWriter + + mongodb = MongoDB(...) + + df = ... # data is here + + writer = DBWriter( + connection=mongodb, + target="schema.table", + options=MongoDB.WriteOptions( + if_exists="append", + ), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.mongodb.options diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index bed53fec5..729c75a30 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -3,11 +3,105 @@ Executing statements in MSSQL ============================= -.. currentmodule:: onetl.connection.db_connection.mssql.connection +.. warning:: -.. automethod:: MSSQL.fetch -.. automethod:: MSSQL.execute -.. automethod:: MSSQL.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MSSQL.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in MSSQL + +Use ``MSSQL.fetch`` +~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +MSSQL config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`mssql-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MSSQL, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2) FROM DUAL`` - call function +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) 
+ + df = mssql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MSSQL.JDBCOptions(query_timeout=10), + ) + mssql.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``MSSQL.execute`` +~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MSSQL, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...`` +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``DECLARE ... BEGIN ... END`` - execute PL/SQL statement +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) + + with mssql: + # automatically close connection after exiting this context manager + mssql.execute("DROP TABLE schema.table") + mssql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=MSSQL.JDBCOptions(query_timeout=10), + ) + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/mssql/index.rst b/docs/connection/db_connection/mssql/index.rst index 5e511e83e..696dfda53 100644 --- a/docs/connection/db_connection/mssql/index.rst +++ b/docs/connection/db_connection/mssql/index.rst @@ -1,12 +1,13 @@ .. _mssql: MSSQL -===== +====== .. toctree:: :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ MSSQL :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mssql/prerequisites.rst b/docs/connection/db_connection/mssql/prerequisites.rst new file mode 100644 index 000000000..33786d61c --- /dev/null +++ b/docs/connection/db_connection/mssql/prerequisites.rst @@ -0,0 +1,77 @@ +.. _mssql-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* SQL Server versions: 2014 - 2022 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_ +and `official compatibility matrix `_. + +Installing PySpark +------------------ + +To use MSSQL connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MSSQL +-------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 1443. Port may differ for different MSSQL instances. +Please ask your MSSQL administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MSSQL by using either DNS name of host or it's IP address. + +If you're using MSSQL cluster, it is currently possible to connect only to **one specific node**. 
+Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MSSQL cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write (schema is owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema + GRANT ALTER ON username.mytable TO username; + + .. code-tab:: sql Read + Write (schema is not owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow managing tables in specific schema, and inserting data to tables + GRANT ALTER, SELECT, INSERT ON SCHEMA::someschema TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; + +More details can be found in official documentation: + * `GRANT ON DATABASE `_ + * `GRANT ON OBJECT `_ + * `GRANT ON SCHEMA `_ diff --git a/docs/connection/db_connection/mssql/read.rst b/docs/connection/db_connection/mssql/read.rst index 3a336f823..0c8599aea 100644 --- a/docs/connection/db_connection/mssql/read.rst +++ b/docs/connection/db_connection/mssql/read.rst @@ -1,18 +1,89 @@ .. _mssql-read: -Reading from MSSQL -================== +Reading from MSSQL using ``DBReader`` +====================================== -There are 2 ways of distributed data reading from MSSQL: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`MSSQL.sql ` +.. warning:: -Both methods accept :obj:`JDBCReadOptions ` + Please take into account :ref:`mssql-types` -.. currentmodule:: onetl.connection.db_connection.mssql.connection +Supported DBReader features +--------------------------- -.. automethod:: MSSQL.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (MSSQL does support hints, but DBReader not, at least for now) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + + mssql = MSSQL(...) + + reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + mssql = MSSQL(...) 
+ + reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), + options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from MSSQL to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst new file mode 100644 index 000000000..8a59d376a --- /dev/null +++ b/docs/connection/db_connection/mssql/sql.rst @@ -0,0 +1,68 @@ +.. _mssql-sql: + +Reading from MSSQL using ``MSSQL.sql`` +======================================== + +``MSSQL.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Please take into account :ref:`mssql-types` + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ❌ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) + df = mssql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MSSQL.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from MSSQL to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst new file mode 100644 index 000000000..ea7fd7102 --- /dev/null +++ b/docs/connection/db_connection/mssql/types.rst @@ -0,0 +1,371 @@ +.. 
_mssql-types: + +MSSQL <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from MSSQL +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MSSQL connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MSSQL type. +* Find corresponding ``MSSQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing MSSQL table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MSSQL connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. +* Find corresponding ``Spark type`` -> ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``MSSQL type (write)`` match ``MSSQL type (read)``, no additional casts will be performed, DataFrame column will be written to MSSQL as is. +* If ``MSSQL type (write)`` does not match ``MSSQL type (read)``, DataFrame column will be casted to target column type **on MSSQL side**. + For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [2]_. + +.. [1] + This allows to write data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column, + it will be populated by MSSQL. + +.. [2] + This is true only if DataFrame column is a ``StringType()``, because text value is parsed automatically to tagret column type. + + But other types cannot be silently converted, like ``int -> text``. This requires explicit casting, see `DBWriter`_. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how MSSQL connector performs this: + +* Find corresponding ``Spark type`` -> ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in MSSQL, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But some cases this may lead to using wrong column type. For example, Spark creates column of type ``timestamp`` +which corresponds to MSSQL's type ``timestamp(0)`` (precision up to seconds) +instead of more precise ``timestamp(6)`` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=mssql, + table="myschema.target_tbl", + options=MSSQL.WriteOptions( + if_exists="append", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. 
code:: python + + mssql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value datetime2(6) -- specific type and precision + ) + """, + ) + + writer = DBWriter( + connection=mssql, + table="myschema.target_tbl", + options=MSSQL.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MSSQL -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> MSSQL `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===============================+===================================+===============================+===============================+ +| ``decimal`` | ``DecimalType(P=18, S=0)`` | ``decimal(P=18, S=0)`` | ``decimal(P=18, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``float`` | ``DoubleType()`` | ``float`` | ``float`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``tinyint`` | ``IntegerType()`` | ``int`` | ``int`` | ++-------------------------------+ | | | +| ``int`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +Temporal types +~~~~~~~~~~~~~~ + +.. note:: + + MSSQL ``timestamp`` type is alias for ``rowversion`` (see `Special types`_). It is not a temporal type! 
+ ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++==========================================+======================================+===================================+===============================+ +| ``date`` | ``DateType()`` | ``date`` | ``date`` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``smalldatetime``, minutes | ``TimestampType()``, microseconds | ``datetime2(6)``, microseconds | ``datetime``, milliseconds | ++------------------------------------------+ | | | +| ``datetime``, milliseconds | | | | ++------------------------------------------+ | | | +| ``datetime2(0)``, seconds | | | | ++------------------------------------------+ | | | +| ``datetime2(3)``, milliseconds | | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime2(6)``, microseconds | ``TimestampType()``, microseconds | ``datetime2(6)``, microseconds | ``datetime``, milliseconds, | ++------------------------------------------+--------------------------------------+-----------------------------------+ **precision loss** [3]_ | +| ``datetime2(7)``, 100s of nanoseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds, | | +| | **precision loss** [4]_ | **precision loss** [4]_ | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(0)``, seconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | ``datetime``, milliseconds | ++------------------------------------------+ with time format quirks [5]_ | | | +| ``time(3)``, milliseconds | | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(6)``, microseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | ``datetime``, milliseconds, | ++ | with time format quirks [5]_ | | **precision loss** [3]_ | ++------------------------------------------+--------------------------------------+-----------------------------------+ + +| ``time``, 100s of nanoseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | | ++------------------------------------------+ **precision loss** [4]_, | **precision loss** [3]_ | | +| ``time(7)``, 100s of nanoseconds | with time format quirks [5]_ | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetimeoffset`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ + +.. 
warning::
+
+    Note that types in MSSQL and Spark have different value ranges:
+
+    +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+
+    | MSSQL type        | Min value                      | Max value                      | Spark type          | Min value                      | Max value                      |
+    +===================+================================+================================+=====================+================================+================================+
+    | ``smalldatetime`` | ``1900-01-01 00:00:00``        | ``2079-06-06 23:59:00``        | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | ``datetime``      | ``1753-01-01 00:00:00.000``    | ``9999-12-31 23:59:59.997``    |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | ``datetime2``     | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+                     |                                |                                |
+    | ``time``          | ``00:00:00.0000000``           | ``23:59:59.9999999``           |                     |                                |                                |
+    +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+
+
+    So not all values in a Spark DataFrame can be written to MSSQL.
+
+    References:
+        * `MSSQL date & time types documentation `_
+        * `Spark DateType documentation `_
+        * `Spark TimestampType documentation `_
+
+.. [3]
+    MSSQL dialect for Spark generates DDL with type ``datetime`` which has precision up to milliseconds (``23:59:59.999``, 10\ :superscript:`-3` seconds).
+    Inserting data with microsecond and higher precision (``23:59:59.999999`` .. ``23:59:59.9999999``, 10\ :superscript:`-6` .. 10\ :superscript:`-7` seconds)
+    will lead to **throwing away microseconds**.
+
+.. [4]
+    MSSQL supports timestamps up to 100s of nanoseconds precision (``23:59:59.9999999``, 10\ :superscript:`-7` seconds),
+    but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``, 10\ :superscript:`-6` seconds).
+    The last digit will be lost during read or write operations.
+
+.. [5]
+    ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from MSSQL like ``23:59:59.999999``
+    it is actually read as ``1970-01-01 23:59:59.999999``, and vice versa.
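+
+If you need the original time-of-day value rather than a timestamp anchored to ``1970-01-01``,
+you can extract it on the Spark side after reading. A minimal sketch (assuming the DataFrame
+has a column ``event_time`` read from an MSSQL ``time(6)`` column):
+
+.. code-block:: python
+
+    from pyspark.sql.functions import date_format
+
+    # keep only the time part, formatted with microsecond precision
+    df = df.withColumn("event_time_str", date_format("event_time", "HH:mm:ss.SSSSSS"))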
+ +String types +~~~~~~~~~~~~~ + ++-------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===================+==================+====================+=====================+ +| ``char`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++-------------------+ | | | +| ``char(N)`` | | | | ++-------------------+ | | | +| ``nchar`` | | | | ++-------------------+ | | | +| ``nchar(N)`` | | | | ++-------------------+ | | | +| ``varchar`` | | | | ++-------------------+ | | | +| ``varchar(N)`` | | | | ++-------------------+ | | | +| ``nvarchar`` | | | | ++-------------------+ | | | +| ``nvarchar(N)`` | | | | ++-------------------+ | | | +| ``mediumtext`` | | | | ++-------------------+ | | | +| ``text`` | | | | ++-------------------+ | | | +| ``ntext`` | | | | ++-------------------+ | | | +| ``xml`` | | | | ++-------------------+------------------+--------------------+---------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------+-------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++====================+===================+====================+=====================+ +| ``bit`` | ``BooleanType()`` | ``bit`` | ``bit`` | ++--------------------+-------------------+--------------------+---------------------+ +| ``binary`` | ``BinaryType()`` | ``varbinary`` | ``varbinary`` | ++--------------------+ | | | +| ``binary(N)`` | | | | ++--------------------+ | | | +| ``varbinary`` | | | | ++--------------------+ | | | +| ``varbinary(N)`` | | | | ++--------------------+ | | | +| ``image`` | | | | ++--------------------+-------------------+--------------------+---------------------+ + +Special types +~~~~~~~~~~~~~~ + ++---------------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===========================+==================+====================+=====================+ +| ``geography`` | ``BinaryType()`` | ``varbinary`` | ``varbinary`` | ++---------------------------+ | | | +| ``geometry`` | | | | ++---------------------------+ | | | +| ``hierarchyid`` | | | | ++---------------------------+ | | | +| ``rowversion`` | | | | ++---------------------------+------------------+--------------------+---------------------+ +| ``sql_variant`` | unsupported | | | ++---------------------------+------------------+--------------------+---------------------+ +| ``sysname`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++---------------------------+ | | | +| ``uniqueidentifier`` | | | | ++---------------------------+------------------+--------------------+---------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on MSSQL side, and so it will be read as Spark's ``StringType()``: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + + mssql = MSSQL(...) 
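+
+    # note: the CAST below is executed by MSSQL itself, so the column arrives
+    # in Spark already as StringType() and can be parsed afterwards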
+ + DBReader( + connection=mssql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + ], + ) + df = reader.run() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert dataframe column to JSON using `to_json `_, +and write it as ``text`` column in MSSQL: + +.. code:: python + + mssql.execute( + """ + CREATE TABLE schema.target_tbl AS ( + id bigint, + struct_column_json text -- any string type, actually + ) + """, + ) + + from pyspark.sql.functions import to_json + + df = df.select( + df.id, + to_json(df.struct_column).alias("struct_column_json"), + ) + + writer.run(df) + +Then you can parse this column on MSSQL side - for example, by creating a view: + +.. code:: sql + + SELECT + id, + JSON_VALUE(struct_column_json, "$.nested.field") AS nested_field + FROM target_tbl + +Or by using `computed column `_: + +.. code-block:: sql + + CREATE TABLE schema.target_table ( + id bigint, + supported_column datetime2(6), + struct_column_json text, -- any string type, actually + -- computed column + nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) + -- or persisted column + -- nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) PERSISTED + ) + +By default, column value is calculated on every table read. +Column marked as ``PERSISTED`` is calculated during insert, but this require additional space. diff --git a/docs/connection/db_connection/mssql/write.rst b/docs/connection/db_connection/mssql/write.rst index c8a5e5906..854283704 100644 --- a/docs/connection/db_connection/mssql/write.rst +++ b/docs/connection/db_connection/mssql/write.rst @@ -1,9 +1,46 @@ .. _mssql-write: -Writing to MSSQL -================ +Writing to MSSQL using ``DBWriter`` +==================================== -For writing data to MSSQL, use :obj:`DBWriter ` with options below. +For writing data to MSSQL, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`mssql-types` + +.. warning:: + + It is always recommended to create table explicitly using :ref:`MSSQL.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBWriter + + mssql = MSSQL(...) + + df = ... # data is here + + writer = DBWriter( + connection=mssql, + target="schema.table", + options=MSSQL.WriteOptions(if_exists="append"), + ) + + writer.run(df) + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index ec5d01482..34460edae 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -3,11 +3,106 @@ Executing statements in MySQL ============================= -.. currentmodule:: onetl.connection.db_connection.mysql.connection +.. warning:: -.. automethod:: MySQL.fetch -.. automethod:: MySQL.execute -.. automethod:: MySQL.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. 
+ + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MySQL.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in MySQL + +Use ``MySQL.fetch`` +~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +MySQL config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`mysql-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MySQL, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{?= call func(arg1, arg2)}`` - special syntax for calling function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MySQL + + mysql = MySQL(...) + + df = mysql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MySQL.JDBCOptions(query_timeout=10), + ) + mysql.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``MySQL.execute`` +~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MySQL, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MySQL + + mysql = MySQL(...) + + with mysql: + # automatically close connection after exiting this context manager + mysql.execute("DROP TABLE schema.table") + mysql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value float + ) + ENGINE = InnoDB + """, + options=MySQL.JDBCOptions(query_timeout=10), + ) + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/mysql/index.rst b/docs/connection/db_connection/mysql/index.rst index e221165cd..f3348141a 100644 --- a/docs/connection/db_connection/mysql/index.rst +++ b/docs/connection/db_connection/mysql/index.rst @@ -7,6 +7,7 @@ MySQL :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ MySQL :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst new file mode 100644 index 000000000..99b7820cb --- /dev/null +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -0,0 +1,64 @@ +.. 
_mysql-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* MySQL server versions: 5.7, 8.0 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use MySQL connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MySQL +----------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MySQL by using either DNS name of host or it's IP address. + +If you're using MySQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 3306. Port may differ for different MySQL instances. +Please ask your MySQL administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MySQL cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE ON myschema.* TO username@'192.168.1.%'; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username@'192.168.1.%'; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username@'192.168.1.%'; + +In example above ``'192.168.1.%''`` is a network subnet ``192.168.1.0 - 192.168.1.255`` +where Spark driver and executors are running. To allow connecting user from any IP, use ``'%'`` (not secure!). + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/mysql/read.rst b/docs/connection/db_connection/mysql/read.rst index 6a6960532..e72da45f1 100644 --- a/docs/connection/db_connection/mysql/read.rst +++ b/docs/connection/db_connection/mysql/read.rst @@ -1,18 +1,91 @@ .. _mysql-read: -Reading from MySQL -================== +Reading from MySQL using ``DBReader`` +===================================== -There are 2 ways of distributed data reading from MySQL: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`MySQL.sql ` +.. warning:: -Both methods accept :obj:`JDBCReadOptions ` + Please take into account :ref:`mysql-types` -.. currentmodule:: onetl.connection.db_connection.mysql.connection +Supported DBReader features +--------------------------- -.. automethod:: MySQL.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ✅︎ ``hint`` (see `official documentation `_) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBReader + + mysql = MySQL(...) 
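+
+    # partition_column/num_partitions below split the read into 10 parallel JDBC
+    # queries, each fetching its own range of ``id`` values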
+ + reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + options=MySQL.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + mysql = MySQL(...) + + reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"), + options=MySQL.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst new file mode 100644 index 000000000..c161efa83 --- /dev/null +++ b/docs/connection/db_connection/mysql/sql.rst @@ -0,0 +1,69 @@ +.. _mysql-sql: + +Reading from MySQL using ``MySQL.sql`` +====================================== + +``MySQL.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Please take into account :ref:`mysql-types` + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MySQL + + mysql = MySQL(...) + df = mysql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MySQL.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from MySQL to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query. 
+Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst new file mode 100644 index 000000000..431665f87 --- /dev/null +++ b/docs/connection/db_connection/mysql/types.rst @@ -0,0 +1,381 @@ +.. _mysql-types: + +MySQL <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from MySQL +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MySQL connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MySQL type. +* Find corresponding ``MySQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing MySQL table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MySQL connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get MySQL type. +* Find corresponding ``Spark type`` -> ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``MySQL type (write)`` match ``MySQL type (read)``, no additional casts will be performed, DataFrame column will be written to MySQL as is. +* If ``MySQL type (write)`` does not match ``MySQL type (read)``, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision. + +.. [1] + This allows to write data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column, + it will be populated by MySQL. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how MySQL connector performs this: + +* Find corresponding ``Spark type`` -> ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in MySQL, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But some cases this may lead to using wrong column type. For example, Spark creates column of type ``timestamp`` +which corresponds to MySQL type ``timestamp(0)`` (precision up to seconds) +instead of more precise ``timestamp(6)`` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=mysql, + table="myschema.target_tbl", + options=MySQL.WriteOptions( + if_exists="append", + createTableOptions="ENGINE = InnoDB", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. 
code:: python + + mysql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value timestamp(6) -- specific type and precision + ) + ENGINE = InnoDB + """, + ) + + writer = DBWriter( + connection=mysql, + table="myschema.target_tbl", + options=MySQL.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MySQL -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> MySQL `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+===================================+===============================+===============================+ +| ``decimal`` | ``DecimalType(P=10, S=0)`` | ``decimal(P=10, S=0)`` | ``decimal(P=10, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38, S=0..30)`` | ``DecimalType(P=0..38, S=0..30)`` | ``decimal(P=0..38, S=0..30)`` | ``decimal(P=0..38, S=0..30)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=39..65, S=...)`` | unsupported [2]_ | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``float`` | ``DoubleType()`` | ``double`` | ``double`` | ++-------------------------------+ | | | +| ``double`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``tinyint`` | ``IntegerType()`` | ``int`` | ``int`` | ++-------------------------------+ | | | +| ``smallint`` | | | | ++-------------------------------+ | | | +| ``mediumint`` | | | | ++-------------------------------+ | | | +| ``int`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +.. [2] + + MySQL support decimal types with precision ``P`` up to 65. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38``. It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
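+
+If such a column still has to be read, one possible workaround is to cast it to a string on the MySQL side,
+and handle the value on the Spark side. A minimal sketch, assuming a hypothetical column ``huge_decimal``
+of type ``decimal(50, 10)``:
+
+.. code-block:: python
+
+    reader = DBReader(
+        connection=mysql,
+        source="schema.table",
+        columns=[
+            "id",
+            # CAST is performed by MySQL, so the column is read as Spark's StringType()
+            "CAST(huge_decimal AS char) huge_decimal_str",
+        ],
+    )
+    df = reader.run()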
+ +Temporal types +~~~~~~~~~~~~~~ + ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================================+======================================+===================================+===============================+ +| ``year`` | ``DateType()`` | ``date`` | ``date`` | ++-----------------------------------+ | | | +| ``date`` | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime``, seconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds | ++-----------------------------------+ | | | +| ``timestamp``, seconds | | | | ++-----------------------------------+ | | | +| ``datetime(0)``, seconds | | | | ++-----------------------------------+ | | | +| ``timestamp(0)``, seconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime(3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds, | ++-----------------------------------+ | | **precision loss** [4]_, | +| ``timestamp(3)``, milliseconds | | | | ++-----------------------------------+ | | | +| ``datetime(6)``, microseconds | | | | ++-----------------------------------+ | | | +| ``timestamp(6)``, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time``, seconds | ``TimestampType()``, microseconds, | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds | ++-----------------------------------+ with time format quirks [5]_ | | | +| ``time(0)``, seconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds, | ++-----------------------------------+ with time format quirks [5]_ | | **precision loss** [4]_, | +| ``time(6)``, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ + +.. 
warning:: + + Note that types in MySQL and Spark have different value ranges: + + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | ``year`` | ``1901`` | ``2155`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``date`` | ``1000-01-01`` | ``9999-12-31`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``datetime`` | ``1000-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.499999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``timestamp`` | ``1970-01-01 00:00:01.000000`` | ``9999-12-31 23:59:59.499999`` | | | | + +---------------+--------------------------------+--------------------------------+ | | | + | ``time`` | ``-838:59:59.000000`` | ``838:59:59.000000`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So Spark can read all the values from MySQL, but not all of values in Spark DataFrame can be written to MySQL. + + References: + * `MySQL year documentation `_ + * `MySQL date, datetime & timestamp documentation `_ + * `MySQL time documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [4] + MySQL dialect generates DDL with MySQL type ``timestamp`` which is alias for ``timestamp(0)`` with precision up to seconds (``23:59:59``). + Inserting data with microseconds precision (``23:59:59.999999``) will lead to **throwing away microseconds**. + +.. [5] + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from MySQL like ``23:59:59`` + it is actually read ``1970-01-01 23:59:59``, and vice versa. 
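+
+If the target column is a MySQL ``timestamp``, it may be safer to check the DataFrame for values outside
+the supported range before writing. A minimal sketch, assuming a hypothetical DataFrame column ``updated_at``:
+
+.. code-block:: python
+
+    from pyspark.sql.functions import col
+
+    # MySQL ``timestamp`` cannot store values before 1970-01-01 00:00:01,
+    # so such rows have to be dropped or fixed before calling writer.run(df)
+    df_in_range = df.where(col("updated_at") >= "1970-01-01 00:00:01")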
+ +String types +~~~~~~~~~~~~~ + ++-------------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+==================+====================+=====================+ +| ``char`` | ``StringType()`` | ``longtext`` | ``longtext`` | ++-------------------------------+ | | | +| ``char(N)`` | | | | ++-------------------------------+ | | | +| ``varchar(N)`` | | | | ++-------------------------------+ | | | +| ``mediumtext`` | | | | ++-------------------------------+ | | | +| ``text`` | | | | ++-------------------------------+ | | | +| ``longtext`` | | | | ++-------------------------------+ | | | +| ``json`` | | | | ++-------------------------------+ | | | +| ``enum("val1", "val2", ...)`` | | | | ++-------------------------------+ | | | +| ``set("val1", "val2", ...)`` | | | | ++-------------------------------+------------------+--------------------+---------------------+ + +Binary types +~~~~~~~~~~~~ + ++-------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================+==================+====================+=====================+ +| ``binary`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++-------------------+ | | | +| ``binary(N)`` | | | | ++-------------------+ | | | +| ``varbinary(N)`` | | | | ++-------------------+ | | | +| ``mediumblob`` | | | | ++-------------------+ | | | +| ``blob`` | | | | ++-------------------+ | | | +| ``longblob`` | | | | ++-------------------+------------------+--------------------+---------------------+ + +Geometry types +~~~~~~~~~~~~~~ + ++------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++========================+==================+====================+=====================+ +| ``point`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++------------------------+ | | | +| ``linestring`` | | | | ++------------------------+ | | | +| ``polygon`` | | | | ++------------------------+ | | | +| ``geometry`` | | | | ++------------------------+ | | | +| ``multipoint`` | | | | ++------------------------+ | | | +| ``multilinestring`` | | | | ++------------------------+ | | | +| ``multipolygon`` | | | | ++------------------------+ | | | +| ``geometrycollection`` | | | | ++------------------------+------------------+--------------------+---------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on MySQL side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use `JSON_OBJECT `_ MySQL function +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: + +.. code-block:: python + + from pyspark.sql.types import IntegerType, StructField, StructType + + from onetl.connection import MySQL + from onetl.db import DBReader + + mysql = MySQL(...) 
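+
+    # note: both CAST(...) and JSON_OBJECT(...) below are evaluated by MySQL,
+    # so the corresponding columns arrive in Spark as StringType();
+    # ``from_json`` used further below comes from pyspark.sql.functions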
+ + DBReader( + connection=mysql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "JSON_OBJECT('key', value_column) json_column", + ], + ) + df = reader.run() + + # Spark requires all columns to have some type, describe it + column_type = StructType([StructField("key", IntegerType())]) + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + # or explicit json parsing + from_json(df.json_column, schema).alias("struct_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert dataframe column to JSON using `to_json `_, +and write it as ``text`` column in MySQL: + +.. code:: python + + mysql.execute( + """ + CREATE TABLE schema.target_tbl AS ( + id bigint, + array_column_json json -- any string type, actually + ) + ENGINE = InnoDB + """, + ) + + from pyspark.sql.functions import to_json + + df = df.select( + df.id, + to_json(df.array_column).alias("array_column_json"), + ) + + writer.run(df) + +Then you can parse this column on MySQL side - for example, by creating a view: + +.. code:: sql + + SELECT + id, + array_column_json->"$[0]" AS array_item + FROM target_tbl + +Or by using `GENERATED column `_: + +.. code-block:: sql + + CREATE TABLE schema.target_table ( + id bigint, + supported_column timestamp, + array_column_json json, -- any string type, actually + -- virtual column + array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) VIRTUAL + -- or stired column + -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) STORED + ) + +``VIRTUAL`` column value is calculated on every table read. +``STORED`` column value is calculated during insert, but this require additional space. diff --git a/docs/connection/db_connection/mysql/write.rst b/docs/connection/db_connection/mysql/write.rst index 67f13cf1b..2d7c056c9 100644 --- a/docs/connection/db_connection/mysql/write.rst +++ b/docs/connection/db_connection/mysql/write.rst @@ -1,9 +1,50 @@ .. _mysql-write: -Writing to MySQL -================ +Writing to MySQL using ``DBWriter`` +======================================== -For writing data to MySQL, use :obj:`DBWriter ` with options below. +For writing data to MySQL, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`mysql-types` + +.. warning:: + + It is always recommended to create table explicitly using :ref:`MySQL.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBWriter + + mysql = MySQL(...) + + df = ... # data is here + + writer = DBWriter( + connection=mysql, + target="schema.table", + options=MySQL.WriteOptions( + if_exists="append", + # ENGINE is required by MySQL + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + + writer.run(df) + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/oracle/execute.rst b/docs/connection/db_connection/oracle/execute.rst index 24ea689a4..b8533b278 100644 --- a/docs/connection/db_connection/oracle/execute.rst +++ b/docs/connection/db_connection/oracle/execute.rst @@ -3,11 +3,106 @@ Executing statements in Oracle ============================== -.. currentmodule:: onetl.connection.db_connection.oracle.connection +.. warning:: -.. automethod:: Oracle.fetch -.. automethod:: Oracle.execute -.. automethod:: Oracle.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Oracle.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in Oracle + +Use ``Oracle.fetch`` +~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Oracle config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`oracle-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Oracle, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2) FROM DUAL`` - call function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + + df = oracle.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Oracle.JDBCOptions(query_timeout=10), + ) + oracle.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Oracle.execute`` +~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Oracle, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...`` +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``DECLARE ... BEGIN ... END`` - execute PL/SQL statement +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + + with oracle: + # automatically close connection after exiting this context manager + oracle.execute("DROP TABLE schema.table") + oracle.execute( + """ + CREATE TABLE schema.table AS ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=Oracle.JDBCOptions(query_timeout=10), + ) + +Options +------- .. 
currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/oracle/index.rst b/docs/connection/db_connection/oracle/index.rst index 519250fb5..ec9005397 100644 --- a/docs/connection/db_connection/oracle/index.rst +++ b/docs/connection/db_connection/oracle/index.rst @@ -7,6 +7,7 @@ Oracle :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ Oracle :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst new file mode 100644 index 000000000..b5b64e437 --- /dev/null +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -0,0 +1,112 @@ +.. _oracle-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Oracle Server versions: 23, 21, 19, 18, 12.2 and __probably__ 11.2 (tested, but it's not mentioned in official docs). +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Oracle connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Oracle +-------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 1521. Port may differ for different Oracle instances. +Please ask your Oracle administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Oracle by using either DNS name of host or it's IP address. + +If you're using Oracle cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Connect as proxy user +~~~~~~~~~~~~~~~~~~~~~ + +It is possible to connect to database as another user without knowing this user password. + +This can be enabled by granting user a special ``CONNECT THROUGH`` permission: + +.. code-block:: sql + + ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; + +Then you can connect to Oracle using credentials of ``proxy_user`` but specify that you need permissions of ``schema_owner``: + +.. code-block:: python + + oracle = Oracle( + ..., + user="proxy_user[schema_owner]", + password="proxy_user password", + ) + +See `official documentation `_. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Oracle cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write (schema is owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in user schema + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + .. 
code-tab:: sql Read + Write (schema is not owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT CREATE ANY TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON someschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT DROP ANY TABLE TO username; + + .. code-tab:: sql Read only + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; + +More details can be found in official documentation: + * `GRANT `_ + * `SELECT `_ + * `CREATE TABLE `_ + * `INSERT `_ + * `TRUNCATE TABLE `_ diff --git a/docs/connection/db_connection/oracle/read.rst b/docs/connection/db_connection/oracle/read.rst index ffd393e6e..6592cfc7b 100644 --- a/docs/connection/db_connection/oracle/read.rst +++ b/docs/connection/db_connection/oracle/read.rst @@ -1,18 +1,91 @@ .. _oracle-read: -Reading from Oracle -=================== +Reading from Oracle using ``DBReader`` +====================================== -There are 2 ways of distributed data reading from Oracle: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Oracle.sql ` +.. warning:: -Both methods accept :obj:`JDBCReadOptions ` + Please take into account :ref:`oracle-types` -.. currentmodule:: onetl.connection.db_connection.oracle.connection +Supported DBReader features +--------------------------- -.. automethod:: Oracle.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ✅︎ ``hint`` (see `official documentation `_) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + oracle = Oracle(...) + + reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), + options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. 
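+
+For example (a minimal sketch, column names are hypothetical):
+
+.. code-block:: python
+
+    # read only the columns you actually need, instead of columns=["*"]
+    reader = DBReader(
+        connection=oracle,
+        source="schema.table",
+        columns=["id", "key", "updated_dt"],
+    )
+    df = reader.run()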
+ +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst new file mode 100644 index 000000000..969c28afa --- /dev/null +++ b/docs/connection/db_connection/oracle/sql.rst @@ -0,0 +1,69 @@ +.. _oracle-sql: + +Reading from Oracle using ``Oracle.sql`` +======================================== + +``Oracle.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Please take into account :ref:`oracle-types` + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + df = oracle.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR2(4000)) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Oracle.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Oracle to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst new file mode 100644 index 000000000..c06e43b13 --- /dev/null +++ b/docs/connection/db_connection/oracle/types.rst @@ -0,0 +1,401 @@ +.. _oracle-types: + +Oracle <-> Spark type mapping +============================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Oracle +~~~~~~~~~~~~~~~~~~~ + +This is how Oracle connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Oracle type. +* Find corresponding ``Oracle type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. 
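For example, you can verify which Spark types were chosen by inspecting the DataFrame schema after reading (a minimal sketch; the table name is hypothetical):

.. code-block:: python

    from onetl.connection import Oracle
    from onetl.db import DBReader

    oracle = Oracle(...)

    reader = DBReader(connection=oracle, source="schema.table")
    df = reader.run()

    # each column receives a Spark type from the mapping tables below,
    # e.g. Oracle NUMBER is read as DecimalType(38, 10)
    df.printSchema()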
+ +Writing to some existing Oracle table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Oracle connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get Oracle type. +* **Find corresponding** ``Oracle type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` -> ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``Oracle type (write)`` matches ``Oracle type (read)``, no additional casts will be performed, DataFrame column will be written to Oracle as is. +* If ``Oracle type (write)`` does not match ``Oracle type (read)``, DataFrame column will be cast to the target column type **on Oracle side**. + For example, you can write a column with text data to an ``int`` column, if the column contains valid integer values within the supported value range and precision. + +.. [1] + This allows writing data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column, + it will be populated by Oracle. + +.. [2] + + Yes, this is weird. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how Oracle connector performs this: + +* Find corresponding ``Spark type`` -> ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in Oracle, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But the Oracle connector supports only a limited number of types and almost no custom clauses (like ``PARTITION BY``, ``INDEX``, etc). +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=oracle, + table="public.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) + +Always prefer creating the table with the desired DDL **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. code:: python + + oracle.execute( + """ + CREATE TABLE username.table ( + id NUMBER, + business_dt TIMESTAMP(6), + value VARCHAR2(2000) + ) + """, + ) + + writer = DBWriter( + connection=oracle, + table="username.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) + +See Oracle `CREATE TABLE `_ documentation. + +Supported types +--------------- + +References +~~~~~~~~~~ + +See `List of Oracle types `_.
+ +Here you can find source code with type conversions: + +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ + +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==================================+===================================+===============================+===========================+ +| ``NUMBER`` | ``DecimalType(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``NUMBER(P=0..38, S=0)`` | ``NUMBER(P=38, S=0)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``NUMBER(P=0..38, S=0..38)`` | ``NUMBER(P=38, S=0..38)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=..., S=-127..-1)`` | unsupported [3]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``FLOAT`` | ``DecimalType(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ++----------------------------------+ | | | +| ``FLOAT(N)`` | | | | ++----------------------------------+ | | | +| ``REAL`` | | | | ++----------------------------------+ | | | +| ``DOUBLE PRECISION`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``BINARY_FLOAT`` | ``FloatType()`` | ``NUMBER(P=19, S=4)`` | ``NUMBER(P=19, S=4)`` | ++----------------------------------+-----------------------------------+ | | +| ``BINARY_DOUBLE`` | ``DoubleType()`` | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``SMALLINT`` | ``DecimalType(P=38, S=0)`` | ``NUMBER(P=38, S=0)`` | ``NUMBER(P=38, S=0)`` | ++----------------------------------+ | | | +| ``INTEGER`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``LONG`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ + +.. [3] + + Oracle support decimal types with negative scale, like ``NUMBER(38, -10)``. Spark doesn't. 
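If you need to read such a column, one possible workaround (in line with the explicit type cast approach described at the end of this page) is to cast it on the Oracle side; a sketch with hypothetical table and column names:

.. code-block:: python

    from onetl.connection import Oracle
    from onetl.db import DBReader

    oracle = Oracle(...)

    reader = DBReader(
        connection=oracle,
        source="schema.table",
        columns=[
            "id",
            # NUMBER with negative scale is not supported by Spark,
            # so convert it to a non-negative scale on the Oracle side
            "CAST(scaled_value AS NUMBER(38, 0)) scaled_value",
        ],
    )
    df = reader.run()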
+ +Temporal types +~~~~~~~~~~~~~~ + ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++============================================+====================================+=================================+=================================+ +| ``DATE``, days | ``TimestampType()``, microseconds | ``TIMESTAMP(6)``, microseconds | ``TIMESTAMP(6)``, microseconds | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP``, microseconds | ``TimestampType()``, microseconds | ``TIMESTAMP(6)``, microseconds | ``TIMESTAMP(6)``, microseconds | ++--------------------------------------------+ | | | +| ``TIMESTAMP(0)``, seconds | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(3)``, milliseconds | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(6)``, microseconds | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP(9)``, nanoseconds | ``TimestampType()``, microseconds, | ``TIMESTAMP(6)``, microseconds, | ``TIMESTAMP(6)``, microseconds, | +| | **precision loss** [4]_ | **precision loss** | **precision loss** | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP WITH TIME ZONE`` | unsupported | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(N) WITH TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP WITH LOCAL TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(N) WITH LOCAL TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``INTERVAL YEAR TO MONTH`` | | | | ++--------------------------------------------+ | | | +| ``INTERVAL DAY TO SECOND`` | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ + +.. 
warning:: + + Note that types in Oracle and Spark have different value ranges: + + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Oracle type | Min value | Max value | Spark type | Min value | Max value | + +===============+====================================+===================================+=====================+================================+================================+ + | ``date`` | ``-4712-01-01`` | ``9999-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4712-01-01 00:00:00.000000000`` | ``9999-12-31 23:59:59.999999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Oracle to Spark. + + References: + * `Oracle date, timestamp and intervals documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [4] + Oracle support timestamp up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + +String types +~~~~~~~~~~~~ + ++-----------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=============================+==================+=====================+======================+ +| ``CHAR`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++-----------------------------+ | | | +| ``CHAR(N CHAR)`` | | | | ++-----------------------------+ | | | +| ``CHAR(N BYTE)`` | | | | ++-----------------------------+ | | | +| ``NCHAR`` | | | | ++-----------------------------+ | | | +| ``NCHAR(N)`` | | | | ++-----------------------------+ | | | +| ``VARCHAR(N)`` | | | | ++-----------------------------+ | | | +| ``LONG VARCHAR`` | | | | ++-----------------------------+ | | | +| ``VARCHAR2(N CHAR)`` | | | | ++-----------------------------+ | | | +| ``VARCHAR2(N BYTE)`` | | | | ++-----------------------------+ | | | +| ``NVARCHAR2(N)`` | | | | ++-----------------------------+ | | | +| ``CLOB`` | | | | ++-----------------------------+ | | | +| ``NCLOB`` | | | | ++-----------------------------+------------------+---------------------+----------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==========================+==================+=====================+======================+ +| ``RAW(N)`` | ``BinaryType()`` | ``BLOB`` | ``BLOB`` | ++--------------------------+ | | | +| ``LONG RAW`` | | | | ++--------------------------+ | | | +| ``BLOB`` | | | | ++--------------------------+------------------+---------------------+----------------------+ +| ``BFILE`` | unsupported | | | ++--------------------------+------------------+---------------------+----------------------+ + +Struct types +~~~~~~~~~~~~ + 
++-------------------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=====================================+==================+=====================+======================+ +| ``XMLType`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++-------------------------------------+ | | | +| ``URIType`` | | | | ++-------------------------------------+ | | | +| ``DBURIType`` | | | | ++-------------------------------------+ | | | +| ``XDBURIType`` | | | | ++-------------------------------------+ | | | +| ``HTTPURIType`` | | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... AS OBJECT (...)`` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ +| ``JSON`` | unsupported | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... AS VARRAY ...`` | | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... AS TABLE OF ...`` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ + +Special types +~~~~~~~~~~~~~ + ++--------------------+-------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++====================+===================+=====================+======================+ +| ``BOOLEAN`` | ``BooleanType()`` | ``BOOLEAN`` | ``NUMBER(P=1, S=0)`` | ++--------------------+-------------------+---------------------+----------------------+ +| ``ROWID`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++--------------------+ | | | +| ``UROWID`` | | | | ++--------------------+ | | | +| ``UROWID(N)`` | | | | ++--------------------+-------------------+---------------------+----------------------+ +| ``ANYTYPE`` | unsupported | | | ++--------------------+ | | | +| ``ANYDATA`` | | | | ++--------------------+ | | | +| ``ANYDATASET`` | | | | ++--------------------+-------------------+---------------------+----------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column of unsupported type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS CLOB)`` to convert data to string representation on Oracle side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use `JSON_ARRAY `_ +or `JSON_OBJECT `_ Oracle functions +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: + +.. code-block:: python + + from pyspark.sql.types import IntegerType + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) 
+ + DBReader( + connection=oracle, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str", + # or + "JSON_ARRAY(array_column) array_column_json", + ], + ) + df = reader.run() + + # Spark requires all columns to have some type, describe it + column_type = IntegerType() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + # or explicit json parsing + from_json(df.array_column_json, schema).alias("array_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Oracle table. + +For example, you can convert data using `to_json `_ function. + +.. code:: python + + from pyspark.sql.functions import to_json + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + oracle.execute( + """ + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000) -- any string type, actually + ) + """, + ) + + write_df = df.select( + df.id, + df.supported_column, + to_json(df.unsupported_column).alias("array_column_json"), + ) + + writer = DBWriter( + connection=oracle, + target="schema.target_table", + ) + writer.run(write_df) + +Then you can parse this column on Oracle side - for example, by creating a view: + +.. code-block:: sql + + SELECT + id, + supported_column, + JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0 + FROM + schema.target_table + +Or by using `VIRTUAL column `_: + +.. code-block:: sql + + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000), -- any string type, actually + array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL + ) + +But data will be parsed on each table read in any case, as Oracle does no support ``GENERATED ALWAYS AS (...) STORED`` columns. diff --git a/docs/connection/db_connection/oracle/write.rst b/docs/connection/db_connection/oracle/write.rst index 78c57d915..5ce0e3b86 100644 --- a/docs/connection/db_connection/oracle/write.rst +++ b/docs/connection/db_connection/oracle/write.rst @@ -1,9 +1,46 @@ .. _oracle-write: -Writing to Oracle -================= +Writing to Oracle using ``DBWriter`` +==================================== -For writing data to Oracle, use :obj:`DBWriter ` with options below. +For writing data to Oracle, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`oracle-types` + +.. warning:: + + It is always recommended to create table explicitly using :ref:`Oracle.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBWriter + + oracle = Oracle(...) + + df = ... # data is here + + writer = DBWriter( + connection=oracle, + target="schema.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + + writer.run(df) + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 042b97197..8c966c110 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -3,11 +3,104 @@ Executing statements in Postgres ================================ -.. currentmodule:: onetl.connection.db_connection.postgres.connection +.. warning:: -.. automethod:: Postgres.fetch -.. automethod:: Postgres.execute -.. automethod:: Postgres.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Postgres.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in Postgres + +Use ``Postgres.fetch`` +~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Postgres config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`postgres-types`. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Postgres, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...``\ +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + + df = postgres.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Postgres.JDBCOptions(query_timeout=10), + ) + postgres.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Postgres.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Postgres, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + + with postgres: + # automatically close connection after exiting this context manager + postgres.execute("DROP TABLE schema.table") + postgres.execute( + """ + CREATE TABLE schema.table AS ( + id bigint GENERATED ALWAYS AS IDENTITY, + key text, + value real + ) + """, + options=Postgres.JDBCOptions(query_timeout=10), + ) + +Options +------- .. 
currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/postgres/index.rst b/docs/connection/db_connection/postgres/index.rst index f5376ee93..74d199bbc 100644 --- a/docs/connection/db_connection/postgres/index.rst +++ b/docs/connection/db_connection/postgres/index.rst @@ -7,6 +7,7 @@ Postgres :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ Postgres :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst new file mode 100644 index 000000000..509b54bc0 --- /dev/null +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -0,0 +1,75 @@ +.. _postgres-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* PostgreSQL server versions: 8.2 or higher +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Postgres connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Postgres +----------------------- + +Allowing connection to Postgres instance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ask your Postgres administrator to allow your user (and probably IP) to connect to instance, +e.g. by updating ``pg_hba.conf`` file. + +See `official documentation `_. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 5432. Port may differ for different Postgres instances. +Please ask your Postgres administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Postgres by using either DNS name of host or it's IP address. + +If you're using Postgres cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Postgres cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in specific schema + GRANT USAGE, CREATE ON SCHEMA myschema TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + GRANT TRUNCATE ON myschema.mytable TO username; + + .. code-tab:: sql Read only + + -- allow creating tables in specific schema + GRANT USAGE ON SCHEMA myschema TO username; + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index 737a731db..67b5234a2 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -1,18 +1,89 @@ .. 
_postgres-read: -Reading from Postgres -===================== +Reading from Postgres using ``DBReader`` +======================================== -There are 2 ways of distributed data reading from Postgres: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Postgres.sql ` +.. warning:: -Both methods accept :obj:`JDBCReadOptions ` + Please take into account :ref:`postgres-types` -.. currentmodule:: onetl.connection.db_connection.postgres.connection +Supported DBReader features +--------------------------- -.. automethod:: Postgres.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Postgres) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=Postgres.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + postgres = Postgres(...) + + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), + options=Postgres.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Postgres to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst new file mode 100644 index 000000000..3f762f1ad --- /dev/null +++ b/docs/connection/db_connection/postgres/sql.rst @@ -0,0 +1,68 @@ +.. _postgres-sql: + +Reading from Postgres using ``Postgres.sql`` +============================================ + +``Postgres.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Please take into account :ref:`postgres-types` + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. 
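For example (the function name below is purely hypothetical), the following query looks like a plain read, but any changes made inside the function are applied to the database:

.. code-block:: python

    from onetl.connection import Postgres

    postgres = Postgres(...)

    # if cleanup_old_rows() deletes data internally, those deletes take effect,
    # because the connection used by Postgres.sql is not read-only
    df = postgres.sql("SELECT cleanup_old_rows('2023-01-01') AS removed_rows")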
+ +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Postgres.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Postgres to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data sent from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst new file mode 100644 index 000000000..5424289e4 --- /dev/null +++ b/docs/connection/db_connection/postgres/types.rst @@ -0,0 +1,490 @@ +.. _postgres-types: + +Postgres <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Postgres +~~~~~~~~~~~~~~~~~~~~~ + +This is how Postgres connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Postgres type. +* Find corresponding ``Postgres type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +.. [1] + All Postgres types that don't have a corresponding Java type are converted to ``String``. + +Writing to some existing Postgres table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Postgres connector performs this: + +* Get names of columns in DataFrame. [2]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get Postgres type. +* Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``Postgres type (write)`` matches ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is. +* If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be cast to the target column type **on Postgres side**. + For example, you can write a column with text data to an ``int`` column, if the column contains valid integer values within the supported value range and precision [3]_. + +.. [2] + This allows writing data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column, + it will be populated by Postgres. + +..
[3] + This is true only if either DataFrame column is a ``StringType()``, or target column is ``text`` type. + + But other types cannot be silently converted, like ``bytea -> bit(N)``. This requires explicit casting, see `Manual conversion to string`_. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how Postgres connector performs this: + +* Find corresponding ``Spark type`` -> ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in Postgres, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But Postgres connector support only limited number of types and almost no custom clauses (like ``PARTITION BY``, ``INDEX``, etc). +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=postgres, + table="public.table", + options=Postgres.WriteOptions( + if_exists="append", + createTableOptions="PARTITION BY RANGE (id)", + ), + ) + writer.run(df) + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. code:: python + + postgres.execute( + """ + CREATE TABLE public.table AS ( + id bigint, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (Id) + """, + ) + + writer = DBWriter( + connection=postgres, + table="public.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + writer.run(df) + +See Postgres `CREATE TABLE `_ documentation. + +Supported types +--------------- + +References +~~~~~~~~~~ + +See `List of Postgres types `_. + +Here you can find source code with type conversions: + +* `Postgres <-> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ + +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==================================+===================================+===============================+=========================+ +| ``decimal`` | ``DecimalType(P=38, S=18)`` | ``decimal(P=38, S=18)`` | ``decimal`` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=39.., S=0..)`` | unsupported [4]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=.., S=..-1)`` | unsupported [5]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``double precision`` | ``DoubleType()`` | ``double precision`` | ``double precision`` | 
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++----------------------------------+-----------------------------------+ | | +| ``-`` | ``ByteType()`` | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``integer`` | ``IntegerType()`` | ``integer`` | ``integer`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``money`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------------------+ | | | +| ``int4range`` | | | | ++----------------------------------+ | | | +| ``int8range`` | | | | ++----------------------------------+ | | | +| ``numrange`` | | | | ++----------------------------------+ | | | +| ``int2vector`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ + +.. [4] + + Postgres support decimal types with unlimited precision. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +.. [5] + + Postgres support decimal types with negative scale, like ``decimal(38, -10)``. Spark doesn't. + +Temporal types +~~~~~~~~~~~~~~ + ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++====================================+==============================+=======================+=========================+ +| ``date`` | ``DateType()`` | ``date`` | ``date`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``time`` | ``TimestampType()``, | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+ with time format quirks [6]_ | | | +| ``time(0..6)`` | | | | ++------------------------------------+ | | | +| ``time with time zone`` | | | | ++------------------------------------+ | | | +| ``time(0..6) with time zone`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``timestamp`` | ``TimestampType()`` | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+ | | | +| ``timestamp(0..6)`` | | | | ++------------------------------------+ | | | +| ``timestamp with time zone`` | | | | ++------------------------------------+ | | | +| ``timestamp(0..6) with time zone`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``TimestampNTZType()`` | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``interval`` of any precision | ``StringType()`` [1]_ | ``text`` | ``text`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``DayTimeIntervalType()`` | 
unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``YearMonthIntervalType()`` | unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``daterange`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++------------------------------------+ | | | +| ``tsrange`` | | | | ++------------------------------------+ | | | +| ``tstzrange`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ + +.. warning:: + + Note that types in Postgres and Spark have different value ranges: + + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Postgres type | Min value | Max value | Spark type | Min value | Max value | + +===============+=================================+==================================+=====================+================================+================================+ + | ``date`` | ``-4713-01-01`` | ``5874897-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4713-01-01 00:00:00.000000`` | ``294276-12-31 23:59:59.999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+---------------------------------+----------------------------------+ | | | + | ``time`` | ``00:00:00.000000`` | ``24:00:00.000000`` | | | | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Postgres to Spark. + + References: + * `Postgres date/time types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [6] + + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from Postgres like ``23:59:59`` + it is actually read ``1970-01-01 23:59:59``, and vice versa. + +String types +~~~~~~~~~~~~ + ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++=============================+=======================+=======================+=========================+ +| ``character`` | ``StringType()`` | ``text`` | ``text`` | ++-----------------------------+ | | | +| ``character(N)`` | | | | ++-----------------------------+ | | | +| ``character varying`` | | | | ++-----------------------------+ | | | +| ``character varying(N)`` | | | | ++-----------------------------+ | | | +| ``text`` | | | | ++-----------------------------+ | | | +| ``json`` | | | | ++-----------------------------+ | | | +| ``jsonb`` | | | | ++-----------------------------+ | | | +| ``xml`` | | | | ++-----------------------------+-----------------------| | | +| ``CREATE TYPE ... 
AS ENUM`` | ``StringType()`` [1]_ | | | ++-----------------------------+ | | | +| ``tsvector`` | | | | ++-----------------------------+ | | | +| ``tsquery`` | | | | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``CharType()`` | ``unsupported`` | ``unsupported`` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``VarcharType()`` | ``unsupported`` | ``unsupported`` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==========================+=======================+=============================+=========================+ +| ``boolean`` | ``BooleanType()`` | ``boolean`` | ``boolean`` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit`` | ``BooleanType()`` | ``bool``, | ``bool`` | ++--------------------------+ | **cannot insert data** [3]_ | | +| ``bit(N=1)`` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit(N=2..)`` | ``ByteType()`` | ``bytea``, | ``bytea`` | +| | | **cannot insert data** [3]_ | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit varying`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++--------------------------+ | | | +| ``bit varying(N)`` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bytea`` | ``BinaryType()`` | ``bytea`` | ``bytea`` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ + + +Struct types +~~~~~~~~~~~~ + ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++================================+=======================+=======================+=========================+ +| ``T[]`` | ``ArrayType(T)`` | ``T[]`` | ``T[]`` | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``T[][]`` | unsupported | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``CREATE TYPE sometype (...)`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``StructType()`` | unsupported | | ++--------------------------------+-----------------------+ | | +| ``-`` | ``MapType()`` | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ + +Network types +~~~~~~~~~~~~~ + ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ +| ``cidr`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------+ | | | +| ``inet`` | | | | 
++----------------------+ | | | +| ``macaddr`` | | | | ++----------------------+ | | | +| ``macaddr8`` | | | | ++----------------------+-----------------------+-----------------------+-------------------------+ + +Geo types +~~~~~~~~~ + ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ +| ``circle`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------+ | | | +| ``box`` | | | | ++----------------------+ | | | +| ``line`` | | | | ++----------------------+ | | | +| ``lseg`` | | | | ++----------------------+ | | | +| ``path`` | | | | ++----------------------+ | | | +| ``point`` | | | | ++----------------------+ | | | +| ``polygon`` | | | | ++----------------------+ | | | +| ``polygon`` | | | | ++----------------------+-----------------------+-----------------------+-------------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column of unsupported type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on Postgres side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use `to_json `_ Postgres function +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: + +.. code-block:: python + + from pyspark.sql.functions import from_json + from pyspark.sql.types import IntegerType + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + DBReader( + connection=postgres, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "to_json(unsupported_column) array_column_json", + ], + ) + df = reader.run() + + # Spark requires all columns to have some type, describe it + column_type = IntegerType() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + # or explicit json parsing + from_json(df.array_column_json, schema).alias("array_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Postgres table. + +Using ``to_json`` +^^^^^^^^^^^^^^^^^ + +For example, you can convert data using `to_json `_ function. + +.. code:: python + + from pyspark.sql.functions import to_json + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + postgres.execute( + """ + CREATE TABLE schema.target_table ( + id int, + supported_column timestamp, + array_column_json jsonb -- any column type, actually + ) + """, + ) + + write_df = df.select( + df.id, + df.supported_column, + to_json(df.unsupported_column).alias("array_column_json"), + ) + + writer = DBWriter( + connection=postgres, + target="schema.target_table", + ) + writer.run(write_df) + +Then you can parse this column on Postgres side (for example, by creating a view): + +.. 
code-block:: sql + + SELECT + id, + supported_column, + array_column_json->'0' AS array_item_0 + FROM + schema.target_table + +To avoid casting the value on every table read you can use `GENERATED ALWAYS STORED `_ column, but this requires 2x space (for original and parsed value). + +Manual conversion to string +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Postgres connector also supports conversion text value directly to target column type, if this value has a proper format. + +For example, you can write data like ``[123, 345)`` to ``int8range`` type because Postgres allows cast ``'[123, 345)'::int8range'``: + +.. code:: python + + from pyspark.sql.ftypes import StringType + from pyspark.sql.functions import udf + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + postgres.execute( + """ + CREATE TABLE schema.target_table ( + id int, + range_column int8range -- any column type, actually + ) + """, + ) + + + @udf(returnType=StringType()) + def array_to_range(value: tuple): + """This UDF allows to convert tuple[start, end] to Postgres' range format""" + start, end = value + return f"[{start},{end})" + + + write_df = df.select( + df.id, + array_to_range(df.range_column).alias("range_column"), + ) + + writer = DBWriter( + connection=postgres, + target="schema.target_table", + ) + writer.run(write_df) + +This can be tricky to implement and may lead to longer write process. +But this does not require extra space on Postgres side, and allows to avoid explicit value cast on every table read. diff --git a/docs/connection/db_connection/postgres/write.rst b/docs/connection/db_connection/postgres/write.rst index db96b4ec8..f35edf9b8 100644 --- a/docs/connection/db_connection/postgres/write.rst +++ b/docs/connection/db_connection/postgres/write.rst @@ -1,9 +1,46 @@ .. _postgres-write: -Writing to Postgres -=================== +Writing to Postgres using ``DBWriter`` +====================================== -For writing data to Postgres, use :obj:`DBWriter ` with options below. +For writing data to Postgres, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`postgres-types` + +.. warning:: + + It is always recommended to create table explicitly using :ref:`Postgres.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBWriter + + postgres = Postgres(...) + + df = ... # data is here + + writer = DBWriter( + connection=postgres, + target="schema.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + + writer.run(df) + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 80853f919..545b473d8 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -3,11 +3,103 @@ Executing statements in Teradata ================================ -.. currentmodule:: onetl.connection.db_connection.teradata.connection +.. warning:: -.. automethod:: Teradata.fetch -.. automethod:: Teradata.execute -.. 
automethod:: Teradata.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Teradata.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in Teradata + +Use ``Teradata.fetch`` +~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Teradata config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Teradata, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + + df = teradata.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Teradata.JDBCOptions(query_timeout=10), + ) + teradata.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Teradata.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Teradata, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``EXECUTE macro(arg1, arg2)`` +* ✅︎ ``EXECUTE FUNCTION ...`` +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + + with teradata: + # automatically close connection after exiting this context manager + teradata.execute("DROP TABLE database.table") + teradata.execute( + """ + CREATE MULTISET TABLE database.table AS ( + id BIGINT, + key VARCHAR, + value REAL + ) + NO PRIMARY INDEX + """, + options=Teradata.JDBCOptions(query_timeout=10), + ) + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/teradata/index.rst b/docs/connection/db_connection/teradata/index.rst index 2f6d6636d..a7d021947 100644 --- a/docs/connection/db_connection/teradata/index.rst +++ b/docs/connection/db_connection/teradata/index.rst @@ -7,6 +7,7 @@ Teradata :maxdepth: 1 :caption: Connection + prerequisites connection .. 
toctree:: @@ -14,5 +15,6 @@ Teradata :caption: Operations read + sql write execute diff --git a/docs/connection/db_connection/teradata/prerequisites.rst b/docs/connection/db_connection/teradata/prerequisites.rst new file mode 100644 index 000000000..294f9d536 --- /dev/null +++ b/docs/connection/db_connection/teradata/prerequisites.rst @@ -0,0 +1,60 @@ +.. _teradata-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Teradata server versions: 16.10 - 20.0 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Teradata connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Teradata +----------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Teradata by using either DNS name Parsing Engine (PE) host, or it's IP address. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port ``1025``. Port may differ for different Teradata instances. +Please ask your Teradata administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Teradata cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE TABLE ON database TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON database.mytable TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON database.mytable TO username; + +See: + * `Teradata access rights `_ + * `GRANT documentation `_ diff --git a/docs/connection/db_connection/teradata/read.rst b/docs/connection/db_connection/teradata/read.rst index b23c42a59..f4cf95bfb 100644 --- a/docs/connection/db_connection/teradata/read.rst +++ b/docs/connection/db_connection/teradata/read.rst @@ -1,18 +1,119 @@ .. _teradata-read: -Reading from Teradata -===================== +Reading from Teradata using ``DBReader`` +======================================== -There are 2 ways of distributed data reading from Teradata: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Teradata.sql ` +Supported DBReader features +--------------------------- -Both methods accept :obj:`JDBCReadOptions ` +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Teradata) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) -.. currentmodule:: onetl.connection.db_connection.teradata.connection +Examples +-------- -.. automethod:: Teradata.sql +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Teradata + from onetl.db import DBReader + + teradata = Teradata(...) 
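+
+    # the DBReader below reads "database.table" in 10 parallel connections,
+    # distributing rows between them using the expression
+    # HASHAMP(HASHBUCKET(HASHROW(id))) MOD 10 (see "Read data in parallel" below)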
+ + reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + options=Teradata.ReadOptions( + partition_column="id", + num_partitions=10, + partitioning_mode="hash", + ), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Teradata + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + teradata = Teradata(...) + + reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="teradata_hwm", expression="updated_dt"), + options=Teradata.ReadOptions( + partition_column="id", + num_partitions=10, + partitioning_mode="hash", + ), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Teradata to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Read data in parallel +~~~~~~~~~~~~~~~~~~~~~ + +``DBReader`` can read data in multiple parallel connections by passing ``Teradata.ReadOptions(num_partitions=..., partition_column=...)``. + +In the example above, Spark opens 10 parallel connections, and data is evenly distributed between all these connections using expression +``HASHAMP(HASHBUCKET(HASHROW({partition_column}))) MOD {num_partitions}``. +This allows sending each Spark worker only some piece of data, reducing resource consumption. +``partition_column`` here can be table column of any type. + +It is also possible to use ``partitioning_mode="mod"`` or ``partitioning_mode="range"``, but in this case +``partition_column`` have to be an integer, should not contain ``NULL``, and values to be uniformly distributed. +It is also less performant than ``partitioning_mode="hash"`` due to Teradata ``HASHAMP`` implementation. + +Do **NOT** use ``TYPE=FASTEXPORT`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Teradata supports several `different connection types `_: + * ``TYPE=DEFAULT`` - perform plain ``SELECT`` queries + * ``TYPE=FASTEXPORT`` - uses special FastExport protocol for select queries + +But ``TYPE=FASTEXPORT`` uses exclusive lock on the source table, so it is impossible to use multiple Spark workers parallel data read. +This leads to sending all the data to just one Spark worker, which is slow and takes a lot of RAM. + +Prefer using ``partitioning_mode="hash"`` from example above. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/teradata/sql.rst b/docs/connection/db_connection/teradata/sql.rst new file mode 100644 index 000000000..3531d01c3 --- /dev/null +++ b/docs/connection/db_connection/teradata/sql.rst @@ -0,0 +1,66 @@ +.. 
_teradata-sql: + +Reading from Teradata using ``Teradata.sql`` +============================================ + +``Teradata.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + df = teradata.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR) AS value, + updated_at, + HASHAMP(HASHBUCKET(HASHROW(id))) MOD 10 AS part_column + FROM + database.mytable + WHERE + key = 'something' + """, + options=Teradata.ReadOptions( + partition_column="part_column", + num_partitions=10, + lower_bound=0, + upper_bound=9, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Teradata to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/teradata/write.rst b/docs/connection/db_connection/teradata/write.rst index 7e5cbef40..288e79667 100644 --- a/docs/connection/db_connection/teradata/write.rst +++ b/docs/connection/db_connection/teradata/write.rst @@ -1,9 +1,116 @@ .. _teradata-write: -Writing to Teradata -=================== +Writing to Teradata using ``DBWriter`` +====================================== -For writing data to Teradata, use :obj:`DBWriter ` with options below. +For writing data to Teradata, use :obj:`DBWriter `. + +.. warning:: + + It is always recommended to create table explicitly using :ref:`Teradata.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Teradata + from onetl.db import DBWriter + + teradata = Teradata( + ..., + extra={"TYPE": "FASTLOAD", "TMODE": "TERA"}, + ) + + df = ... # data is here + + writer = DBWriter( + connection=teradata, + target="database.table", + options=Teradata.WriteOptions( + if_exists="append", + # avoid creating SET table, use MULTISET + createTableOptions="NO PRIMARY INDEX", + ), + ) + + writer.run(df.repartition(1)) + +Recommendations +--------------- + +Number of connections +~~~~~~~~~~~~~~~~~~~~~ + +Teradata is not MVCC based, so write operations take exclusive lock on the entire table. +So **it is impossible to write data to Teradata table in multiple parallel connections**, no exceptions. + +The only way to write to Teradata without making deadlocks is write dataframe with exactly 1 partition. 
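+
+To check how many partitions a dataframe currently has, you can use a small snippet
+like the one below (shown for illustration; ``df`` is the same dataframe as in the
+examples on this page):
+
+.. code-block:: python
+
+    # number of partitions Spark created for this dataframe
+    print(df.rdd.getNumPartitions())
+
+If this number is greater than 1, the dataframe has to be collapsed to a single partition before writing.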
+ +It can be implemented using ``df.repartition(1)``: + +.. code-block:: python + + # do NOT use df.coalesce(1) as it can freeze + writer.run(df.repartition(1)) + +This moves all the data to just one Spark worker, so it may consume a lot of RAM. It is usually require to increase ``spark.executor.memory`` to handle this. + +Another way is to write all dataframe partitions one-by-one: + +.. code-block:: python + + from pyspark.sql.functions import spark_partition_id + + # get list of all partitions in the dataframe + partitions = sorted(df.select(spark_partition_id()).distinct().collect()) + + for partition in partitions: + # get only part of data within this exact partition + part_df = df.where(**partition.asDict()).coalesce(1) + + writer.run(part_df) + +This require even data distribution for all partitions to avoid data skew and spikes of RAM consuming. + +Choosing connection type +~~~~~~~~~~~~~~~~~~~~~~~~ + +Teradata supports several `different connection types `_: + * ``TYPE=DEFAULT`` - perform plain ``INSERT`` queries + * ``TYPE=FASTLOAD`` - uses special FastLoad protocol for insert queries + +It is always recommended to use ``TYPE=FASTLOAD`` because: + * It provides higher performance + * It properly handles inserting ``NULL`` values (``TYPE=DEFAULT`` raises an exception) + +But it can be used only during write, not read. + +Choosing transaction mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Teradata supports `2 different transaction modes `_: + * ``TMODE=ANSI`` + * ``TMODE=TERA`` + +Choosing one of the modes can alter connector behavior. For example: + * Inserting data which exceeds table column length, like insert ``CHAR(25)`` to column with type ``CHAR(24)``: + * * ``TMODE=ANSI`` - raises exception + * * ``TMODE=TERA`` - truncates input string to 24 symbols + * Creating table using Spark: + * * ``TMODE=ANSI`` - creates ``MULTISET`` table + * * ``TMODE=TERA`` - creates ``SET`` table with ``PRIMARY KEY`` is a first column in dataframe. + This can lead to slower insert time, because each row will be checked against a unique index. + Fortunately, this can be disabled by passing custom ``createTableOptions``. + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/hwm_store/index.rst b/docs/hwm_store/index.rst index 884a72c1e..e6104d63c 100644 --- a/docs/hwm_store/index.rst +++ b/docs/hwm_store/index.rst @@ -3,11 +3,14 @@ HWM === -Since ``onetl>=0.10.0`` version, the HWM Store and HWM classes have been moved to a separate library :etl-entities:`etl-entities <>`. +Since onETL v0.10.0, the ``HWMStore`` and ``HWM`` classes have been moved to a separate library :etl-entities:`etl-entities <>`. -The only class was left intact is YamlHWMStore, **which is default** in onETL: +The only class was left intact is :ref:`yaml-hwm-store`, **which is default** in onETL. + +Other known implementation is `HorizonHWMStore `_. .. toctree:: :maxdepth: 2 + :hidden: yaml_hwm_store diff --git a/docs/hwm_store/yaml_hwm_store.rst b/docs/hwm_store/yaml_hwm_store.rst index b9d268fa7..6ac5e2530 100644 --- a/docs/hwm_store/yaml_hwm_store.rst +++ b/docs/hwm_store/yaml_hwm_store.rst @@ -1,9 +1,9 @@ .. _yaml-hwm-store: -YAML HWM Store -================================================================= +YAMLHWMStore +============ .. currentmodule:: onetl.hwm.store.yaml_hwm_store .. 
autoclass:: YAMLHWMStore - :members: get_hwm, save_hwm, __enter__ + :members: get_hwm, set_hwm, __enter__ diff --git a/docs/robots.txt b/docs/robots.txt new file mode 100644 index 000000000..dcab3dc9e --- /dev/null +++ b/docs/robots.txt @@ -0,0 +1,5 @@ +User-agent: * +Allow: /*/stable/ +Allow: /en/stable/ # Fallback for bots that don't understand wildcards +Disallow: / +Sitemap: https://onel.readthedocs.io/sitemap.xml diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 38776255b..000000000 --- a/mypy.ini +++ /dev/null @@ -1,9 +0,0 @@ -[mypy] -python_version = 3.7 -exclude = site-packages|__pycache__|docs|old|build|dist|tests|venv -strict_optional=True -follow_imports = silent -show_error_codes = True -# ignore typing in third-party packages -ignore_missing_imports = True -plugins = pydantic.mypy diff --git a/onetl/VERSION b/onetl/VERSION index 571215736..5eef0f10e 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.1 +0.10.2 diff --git a/onetl/_internal.py b/onetl/_internal.py index 3b533fac0..298ecdf95 100644 --- a/onetl/_internal.py +++ b/onetl/_internal.py @@ -11,7 +11,11 @@ from typing import TYPE_CHECKING, Any from etl_entities.process import ProcessStackManager -from pydantic import SecretStr + +try: + from pydantic.v1 import SecretStr +except (ImportError, AttributeError): + from pydantic import SecretStr # type: ignore[no-redef, assignment] if TYPE_CHECKING: from pathlib import PurePath diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index c11fb802e..670b15ec5 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -35,7 +35,7 @@ def validate_name(self, value: str) -> str: """ @abstractmethod - def validate_columns(self, columns: list[str] | None) -> list[str] | None: + def validate_columns(self, columns: str | list[str] | None) -> list[str] | None: """Check if ``columns`` value is valid. Raises diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 63eeb455e..88de5645a 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -31,29 +31,9 @@ class Clickhouse(JDBCConnection): Based on Maven package ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` (`official Clickhouse JDBC driver `_). - .. dropdown:: Version compatibility - - * Clickhouse server versions: 20.7 or higher - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Clickhouse connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`clickhouse-prerequisites` Parameters ---------- @@ -82,9 +62,11 @@ class Clickhouse(JDBCConnection): For example: ``{"continueBatchOnError": "false"}``. 
- See `Clickhouse JDBC driver properties documentation - `_ - for more details + See: + * `Clickhouse JDBC driver properties documentation `_ + * `Clickhouse core settings documentation `_ + * `Clickhouse query complexity documentation `_ + * `Clickhouse query level settings `_ Examples -------- diff --git a/onetl/connection/db_connection/db_connection/connection.py b/onetl/connection/db_connection/db_connection/connection.py index ff3e0af7f..1372cd69a 100644 --- a/onetl/connection/db_connection/db_connection/connection.py +++ b/onetl/connection/db_connection/db_connection/connection.py @@ -5,7 +5,10 @@ from logging import getLogger from typing import TYPE_CHECKING -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import BaseDBConnection diff --git a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py index a45f47d28..e443cf8f8 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py +++ b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py @@ -2,15 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import Any +import warnings class SupportColumns: def validate_columns( self, - columns: Any, + columns: str | list[str] | None, ) -> list[str] | None: if columns is None: return ["*"] + if isinstance(columns, str): + warnings.warn( + f"Passing DBReader(columns={columns!r}) is deprecated since v0.10.0 and will be removed in v1.0.0. " + f"Use DBReader(columns=[{columns!r}] instead", + category=UserWarning, + stacklevel=3, + ) + return [columns] + return list(columns) diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 5d0268590..4460be149 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, ClassVar from etl_entities.instance import Host -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl._util.classproperty import classproperty from onetl._util.java import try_import_java_class @@ -63,8 +67,8 @@ class Config: class Greenplum(JDBCMixin, DBConnection): """Greenplum connection. |support_hooks| - Based on package ``io.pivotal:greenplum-spark:2.3.0`` - (`VMware Greenplum connector for Spark `_). + Based on package ``io.pivotal:greenplum-spark:2.2.0`` + (`VMware Greenplum connector for Spark `_). .. warning:: @@ -98,7 +102,7 @@ class Greenplum(JDBCMixin, DBConnection): For example: ``{"tcpKeepAlive": "true", "server.port": "50000-65535"}`` Supported options are: - * All `Postgres JDBC driver properties `_ + * All `Postgres JDBC driver properties `_ * Properties from `Greenplum connector for Spark documentation `_ page, but only starting with ``server.`` or ``pool.`` Examples @@ -183,7 +187,7 @@ def get_packages( Used only if ``scala_version=None``. 
- package_version : str, optional, default ``2.3.0`` + package_version : str, optional, default ``2.2.0`` Package version in format ``major.minor.patch`` Examples @@ -193,8 +197,8 @@ def get_packages( from onetl.connection import Greenplum - Greenplum.get_packages(scala_version="2.11") - Greenplum.get_packages(spark_version="3.2") + Greenplum.get_packages(scala_version="2.12") + Greenplum.get_packages(spark_version="3.2", package_version="2.3.0") """ @@ -202,7 +206,7 @@ def get_packages( if package_version: package_ver = Version.parse(package_version) else: - package_ver = Version(2, 3, 0) + package_ver = Version(2, 2, 0) if scala_version: scala_ver = Version.parse(scala_version) @@ -224,21 +228,21 @@ def package_spark_2_3(cls) -> str: """Get package name to be downloaded by Spark 2.3.""" msg = "`Greenplum.package_2_3` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.3')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.3.0" + return "io.pivotal:greenplum-spark_2.11:2.2.0" @classproperty def package_spark_2_4(cls) -> str: """Get package name to be downloaded by Spark 2.4.""" msg = "`Greenplum.package_2_4` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.4')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.3.0" + return "io.pivotal:greenplum-spark_2.11:2.2.0" @classproperty def package_spark_3_2(cls) -> str: """Get package name to be downloaded by Spark 3.2.""" msg = "`Greenplum.package_3_2` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='3.2')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.12:2.3.0" + return "io.pivotal:greenplum-spark_2.12:2.2.0" @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 123edec1b..e100e35f1 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -6,7 +6,10 @@ from enum import Enum from typing import Optional -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.connection.db_connection.jdbc_mixin import JDBCOptions @@ -76,7 +79,7 @@ class GreenplumReadOptions(JDBCOptions): .. warning:: Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, - etc are populated from connection attributes, and cannot be set in ``ReadOptions`` class + etc are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -208,8 +211,8 @@ class GreenplumWriteOptions(JDBCOptions): .. warning:: - Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, - etc are populated from connection attributes, and cannot be set in ``WriteOptions`` class + Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, etc + are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. 
Examples -------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 34d5af5cb..f465fd73a 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -7,7 +7,11 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable from etl_entities.instance import Cluster -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement from onetl._util.spark import inject_spark_param @@ -497,7 +501,8 @@ def _insert_into( unsupported_options = write_options.dict(by_alias=True, exclude_unset=True, exclude={"if_exists"}) if unsupported_options: log.warning( - "|%s| Options %r are not supported while inserting into existing table, ignoring", + "|%s| User-specified options %r are ignored while inserting into existing table. " + "Using only table parameters from Hive metastore", self.__class__.__name__, unsupported_options, ) @@ -506,7 +511,8 @@ def _insert_into( # So we should sort columns according their order in the existing table # instead of using order from the dataframe columns = self._sort_df_columns_like_table(table, df.columns) - writer = df.select(*columns).write + if columns != df.columns: + df = df.select(*columns) # Writer option "partitionOverwriteMode" was added to Spark only in 2.4.0 # so using a workaround with patching Spark config and then setting up the previous value @@ -514,7 +520,7 @@ def _insert_into( overwrite = write_options.if_exists != HiveTableExistBehavior.APPEND log.info("|%s| Inserting data into existing table %r ...", self.__class__.__name__, table) - writer.insertInto(table, overwrite=overwrite) + df.write.insertInto(table, overwrite=overwrite) log.info("|%s| Data is successfully inserted into table %r.", self.__class__.__name__, table) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 357584660..5f687112f 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -6,8 +6,12 @@ from enum import Enum from typing import List, Optional, Tuple, Union -from deprecated import deprecated -from pydantic import Field, root_validator, validator +try: + from pydantic.v1 import Field, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator, validator # type: ignore[no-redef, assignment] + +from typing_extensions import deprecated from onetl.impl import GenericOptions @@ -78,8 +82,8 @@ class HiveWriteOptions(GenericOptions): options = Hive.WriteOptions( if_exists="append", - partitionBy="reg_id", - someNewOption="value", + partition_by="reg_id", + customOption="value", ) """ @@ -206,7 +210,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ partition_by: Optional[Union[List[str], str]] = Field(default=None, alias="partitionBy") @@ -233,7 +237,7 @@ class Config: .. 
warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ bucket_by: Optional[Tuple[int, Union[List[str], str]]] = Field(default=None, alias="bucketBy") # noqa: WPS234 @@ -258,14 +262,14 @@ class Config: It is recommended to use this option **ONLY** if you have a large table (hundreds of Gb or more), which is used mostly for JOINs with other tables, - and you're inserting data using ``if_exists=overwrite_partitions`` or ``if_exists=recreate_entire_table``. + and you're inserting data using ``if_exists=overwrite_partitions`` or ``if_exists=replace_entire_table``. Otherwise Spark will create a lot of small files (one file for each bucket and each executor), drastically **decreasing** HDFS performance. .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ sort_by: Optional[Union[List[str], str]] = Field(default=None, alias="sortBy") @@ -283,7 +287,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ compression: Optional[str] = None @@ -294,7 +298,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ @validator("sort_by") @@ -339,9 +343,7 @@ def _mode_is_deprecated(cls, values): @deprecated( - version="0.5.0", - reason="Please use 'WriteOptions' class instead. Will be removed in v1.0.0", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'WriteOptions' instead", category=UserWarning, ) class HiveLegacyOptions(HiveWriteOptions): diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 9396dc802..3133d3671 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -68,38 +68,15 @@ def sql( Same as ``spark.read.jdbc(query)``. - .. note:: - - This method does not support :ref:`strategy`, - use :obj:`DBReader ` instead - - .. note:: - - Statement is executed in read-write connection, - so if you're calling some functions/procedures with DDL/DML statements inside, - they can change data in your database. - - Unfortunately, Spark does no provide any option to change this behavior. - Parameters ---------- query : str SQL query to be executed. - Only ``SELECT ... FROM ...`` form is supported. - - Some databases also supports ``WITH ... AS (...) SELECT ... FROM ...`` form. - - Queries like ``SHOW ...`` are not supported. - - .. warning:: - - The exact syntax **depends on RDBMS** is being used. - options : dict, :obj:`~ReadOptions`, default: ``None`` - Spark options to be used while fetching data, like ``fetchsize`` or ``partitionColumn`` + Spark options to be used while fetching data. Returns ------- @@ -107,23 +84,6 @@ def sql( Spark dataframe - Examples - -------- - - Read data from a table: - - .. code:: python - - df = connection.sql("SELECT * FROM mytable") - - Read data from a table with options: - - .. 
code:: python - - # reads data from table in batches, 10000 rows per batch - df = connection.sql("SELECT * FROM mytable", {"fetchsize": 10000}) - assert df.count() - """ query = clear_statement(query) diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 8793fc084..50eda6a51 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -6,8 +6,12 @@ from enum import Enum from typing import Optional -from deprecated import deprecated -from pydantic import Field, PositiveInt, root_validator +try: + from pydantic.v1 import Field, PositiveInt, root_validator +except (ImportError, AttributeError): + from pydantic import Field, PositiveInt, root_validator # type: ignore[no-redef, assignment] + +from typing_extensions import deprecated from onetl._internal import to_camel from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions @@ -119,11 +123,11 @@ class JDBCReadOptions(JDBCOptions): .. code:: python options = JDBC.ReadOptions( - partitionColumn="reg_id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - someNewOption="value", + partition_column="reg_id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + customOption="value", ) """ @@ -386,7 +390,7 @@ class JDBCWriteOptions(JDBCOptions): .. code:: python - options = JDBC.WriteOptions(if_exists="append", batchsize=20_000, someNewOption="value") + options = JDBC.WriteOptions(if_exists="append", batchsize=20_000, customOption="value") """ class Config: @@ -512,9 +516,7 @@ def _mode_is_deprecated(cls, values): @deprecated( - version="0.5.0", - reason="Please use 'ReadOptions' or 'WriteOptions' class instead. Will be removed in v1.0.0", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'ReadOptions' or 'WriteOptions' instead", category=UserWarning, ) class JDBCLegacyOptions(JDBCReadOptions, JDBCWriteOptions): diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index d7324db0e..76424c30c 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -9,7 +9,10 @@ from enum import Enum, auto from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar -from pydantic import Field, PrivateAttr, SecretStr, validator +try: + from pydantic.v1 import Field, PrivateAttr, SecretStr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, SecretStr, validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement, stringify from onetl._util.java import get_java_gateway, try_import_java_class @@ -92,7 +95,7 @@ def close(self): .. code:: python - df = connection.fetch("SELECT * FROM mytable") + df = connection.fetch("SELECT * FROM mytable LIMIT 10") assert df.count() connection.close() @@ -144,22 +147,7 @@ def fetch( """ **Immediately** execute SELECT statement **on Spark driver** and return in-memory DataFrame. |support_hooks| - Works almost the same like :obj:`~sql`, but calls JDBC driver directly. - - .. warning:: - - This function should **NOT** be used to return dataframe with large number of rows/columns, - like ``SELECT * FROM table`` (without any filtering). - - Use it **ONLY** to execute queries with **one or just a few returning rows/columns**, - like ``SELECT COUNT(*) FROM table`` or ``SELECT * FROM table WHERE id = ...``. 
- - This is because all the data is fetched through master host, not in a distributed way - (even on clustered instances). - Also the resulting dataframe is stored directly in driver's memory, which is usually quite limited. - - Use :obj:`DBReader ` for reading - data from table via Spark executors, or for handling different :ref:`strategy`. + Works almost the same like :obj:`~sql`, but Spark executor is not used. .. note:: @@ -176,16 +164,6 @@ def fetch( SQL query to be executed. - Can be in any form of SELECT supported by RDBMS, like: - - * ``SELECT ... FROM ...`` - * ``WITH ... AS (...) SELECT ... FROM ...`` - * *Some* RDBMS instances also support ``SHOW ...`` queries, like ``SHOW TABLES`` - - .. warning:: - - The exact syntax **depends on a RDBMS** is being used. - options : dict, :obj:`~JDBCOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``fetchsize`` or ``queryTimeout`` @@ -199,24 +177,6 @@ def fetch( df : pyspark.sql.dataframe.DataFrame Spark dataframe - - Examples - -------- - - Read data from a table: - - .. code:: python - - df = connection.fetch("SELECT * FROM mytable") - assert df.count() - - Read data from a table with options: - - .. code:: python - - # reads data from table in batches, 10000 rows per batch - df = connection.fetch("SELECT * FROM mytable", {"fetchsize": 10000}) - assert df.count() """ query = clear_statement(query) @@ -242,25 +202,9 @@ def execute( """ **Immediately** execute DDL, DML or procedure/function **on Spark driver**. |support_hooks| - Returns DataFrame only if input is DML statement with ``RETURNING ...`` clause, or a procedure/function call. - In other cases returns ``None``. - There is no method like this in :obj:`pyspark.sql.SparkSession` object, but Spark internal methods works almost the same (but on executor side). - .. warning:: - - Use this method **ONLY** to execute queries which return **small number of rows/column, - or return nothing**. - - This is because all the data is fetched through master host, not in a distributed way - (even on clustered instances). - Also the resulting dataframe is stored directly in driver's memory, which is usually quite limited. - - Please **DO NOT** use this method to execute queries - like ``INSERT INTO ... VALUES(a, lot, of, values)``, - use :obj:`onetl.db.db_writer.db_writer.DBWriter` instead - .. note:: First call of the method opens the connection to a database. @@ -270,33 +214,7 @@ def execute( ---------- statement : str - Statement to be executed, like: - - DML statements: - - * ``INSERT INTO target_table SELECT * FROM source_table`` - * ``UPDATE mytable SET value = 1 WHERE id BETWEEN 100 AND 999`` - * ``DELETE FROM mytable WHERE id BETWEEN 100 AND 999`` - * ``TRUNCATE TABLE mytable`` - - DDL statements: - - * ``CREATE TABLE mytable (...)`` - * ``ALTER SCHEMA myschema ...`` - * ``DROP PROCEDURE myprocedure`` - - Call statements: - - * ``BEGIN ... END`` - * ``EXEC myprocedure`` - * ``EXECUTE myprocedure(arg1)`` - * ``CALL myfunction(arg1, arg2)`` - * ``{call myprocedure(arg1, ?)}`` (``?`` is output parameter) - * ``{?= call myfunction(arg1, arg2)}`` - - .. warning:: - - The exact syntax **depends on a RDBMS** is being used. + Statement to be executed. options : dict, :obj:`~JDBCOptions`, default: ``None`` @@ -310,27 +228,10 @@ def execute( ------- df : pyspark.sql.dataframe.DataFrame, optional - Spark dataframe - - Examples - -------- - - Create table: - - .. 
code:: python - - assert connection.execute("CREATE TABLE target_table(id NUMBER, data VARCHAR)") is None - - Insert data to one table from another, with a specific transaction isolation level, - and return DataFrame with new rows: - - .. code:: python + Spark DataFrame. - df = connection.execute( - "INSERT INTO target_table SELECT * FROM source_table RETURNING id, data", - {"isolationLevel": "READ_COMMITTED"}, - ) - assert df.count() + DataFrame is returned only if input is DML statement with ``RETURNING ...`` clause, + or a procedure/function call. In other cases returns ``None``. """ statement = clear_statement(statement) diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 583cb4f53..349630719 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -4,7 +4,10 @@ from typing import Optional -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index b7f06194c..21a042451 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -8,7 +8,11 @@ from typing import TYPE_CHECKING, Any, List, Optional from etl_entities.instance import Cluster -from pydantic import root_validator, validator + +try: + from pydantic.v1 import root_validator, validator +except (ImportError, AttributeError): + from pydantic import root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.java import try_import_java_class diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index 56037cb32..e364b4123 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING -from pydantic import Field, SecretStr +try: + from pydantic.v1 import Field, SecretStr +except (ImportError, AttributeError): + from pydantic import Field, SecretStr # type: ignore[no-redef, assignment] from onetl.connection.db_connection.kafka.kafka_auth import KafkaAuth from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 2af64f165..6f56ca55d 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -7,7 +7,10 @@ import shutil from typing import TYPE_CHECKING, Optional -from pydantic import Field, PrivateAttr, root_validator, validator +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.file import get_file_hash, is_file_readable diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 579dc7ed0..2015123ee 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ 
b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING -from pydantic import Field, SecretStr +try: + from pydantic.v1 import Field, SecretStr +except (ImportError, AttributeError): + from pydantic import Field, SecretStr # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl._internal import stringify diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index d76c4e333..f7b9ecc15 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -5,7 +5,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional -from pydantic import Field, SecretStr, validator +try: + from pydantic.v1 import Field, SecretStr, validator +except (ImportError, AttributeError): + from pydantic import Field, SecretStr, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.file import is_file_readable diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 28bf44f6b..0b04f2cb5 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -4,7 +4,10 @@ from enum import Enum -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions @@ -71,8 +74,7 @@ class KafkaReadOptions(GenericOptions): * ``subscribe`` * ``subscribePattern`` - are populated from connection attributes, and cannot be set in ``KafkaReadOptions`` class and be overridden - by the user to avoid issues. + are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -117,8 +119,7 @@ class KafkaWriteOptions(GenericOptions): * ``kafka.*`` * ``topic`` - are populated from connection attributes, and cannot be set in ``KafkaWriteOptions`` class and be overridden - by the user to avoid issues. + are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. Examples -------- diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index c84317d88..acaaa6539 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -9,7 +9,11 @@ from urllib import parse as parser from etl_entities.instance import Host -from pydantic import SecretStr, validator + +try: + from pydantic.v1 import SecretStr, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, validator # type: ignore[no-redef, assignment] from onetl._util.classproperty import classproperty from onetl._util.java import try_import_java_class @@ -46,33 +50,12 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. |support_hooks| - Based on package ``org.mongodb.spark:mongo-spark-connector`` - (`MongoDB connector for Spark `_) - - .. dropdown:: Version compatibility - - * MongoDB server versions: 4.0 or higher - * Spark versions: 3.2.x - 3.4.x - * Scala versions: 2.12 - 2.13 - * Java versions: 8 - 20 - - See `official documentation `_. 
+ Based on package ``org.mongodb.spark:mongo-spark-connector:10.1.1`` + (`MongoDB connector for Spark `_) .. warning:: - To use MongoDB connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.4.2 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mongodb-prerequisites` Parameters ---------- @@ -114,7 +97,7 @@ class MongoDB(DBConnection): from pyspark.sql import SparkSession # Create Spark session with MongoDB connector loaded - maven_packages = MongoDB.get_packages(spark_version="3.2") + maven_packages = MongoDB.get_packages(spark_version="3.4") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -128,7 +111,7 @@ class MongoDB(DBConnection): password="*****", database="target_database", spark=spark, - ) + ).check() """ database: str @@ -177,8 +160,8 @@ def get_packages( from onetl.connection import MongoDB - MongoDB.get_packages(scala_version="2.11") - MongoDB.get_packages(spark_version="3.2") + MongoDB.get_packages(scala_version="2.12") + MongoDB.get_packages(spark_version="3.4") """ @@ -233,7 +216,7 @@ def package_spark_3_4(cls) -> str: def pipeline( self, collection: str, - pipeline: dict | list[dict], + pipeline: dict | list[dict] | None = None, df_schema: StructType | None = None, options: MongoDBPipelineOptions | dict | None = None, ): @@ -245,7 +228,7 @@ def pipeline( .. code:: js - db.collection.aggregate([{"$match": ...}, {"$group": ...}]) + db.collection_name.aggregate([{"$match": ...}, {"$group": ...}]) but pipeline is executed on Spark executors, in a distributed way. @@ -254,40 +237,32 @@ def pipeline( This method does not support :ref:`strategy`, use :obj:`DBReader ` instead - .. note:: - - Statement is executed in read-write connection, - so if you're calling some stored functions, you can make changes - to the data source. - - Unfortunately, Spark does no provide any option to change this behavior. - Parameters ---------- collection : str Collection name. - pipeline : dict | list[dict] + pipeline : dict | list[dict], optional Pipeline containing a database query. See `Aggregation pipeline syntax `_. - df_schema : StructType, default: ``None`` + df_schema : StructType, optional Schema describing the resulting DataFrame. - options : PipelineOptions | dict, default: ``None`` + options : PipelineOptions | dict, optional Additional pipeline options, see :obj:`~PipelineOptions`. Examples -------- - Get document with a specific ``_id``: + Get document with a specific ``field`` value: .. 
code:: python df = connection.pipeline( collection="collection_name", - pipeline={"$match": {"_id": {"$eq": 1}}}, + pipeline={"$match": {"field": {"$eq": 1}}}, ) Calculate aggregation and get result: @@ -320,7 +295,7 @@ def pipeline( df_schema = StructType( [ - StructField("_id", IntegerType()), + StructField("_id", StringType()), StructField("some_string", StringType()), StructField("some_int", IntegerType()), StructField("some_datetime", TimestampType()), @@ -340,15 +315,17 @@ def pipeline( df = connection.pipeline( collection="collection_name", - pipeline={"$match": {"_id": {"$eq": 1}}}, - options=MongoDB.PipelineOptions(hint={"_id": 1}), + pipeline={"$match": {"field": {"$eq": 1}}}, + options=MongoDB.PipelineOptions(hint={"field": 1}), ) """ log.info("|%s| Executing aggregation pipeline:", self.__class__.__name__) read_options = self.PipelineOptions.parse(options).dict(by_alias=True, exclude_none=True) - pipeline = self.dialect.prepare_pipeline(pipeline) + if pipeline: + pipeline = self.dialect.prepare_pipeline(pipeline) + log_with_indent(log, "collection = %r", collection) log_json(log, pipeline, name="pipeline") @@ -359,7 +336,8 @@ def pipeline( log_options(log, read_options) read_options["collection"] = collection - read_options["aggregation.pipeline"] = json.dumps(pipeline) + if pipeline: + read_options["aggregation.pipeline"] = json.dumps(pipeline) read_options["connection.uri"] = self.connection_url spark_reader = self.spark.read.format("mongodb").options(**read_options) diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 86ff82f1b..2ddc7a068 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -5,7 +5,10 @@ import warnings from enum import Enum -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions @@ -96,7 +99,7 @@ class MongoDBPipelineOptions(GenericOptions): .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -104,7 +107,7 @@ class MongoDBPipelineOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection``, ``pipeline`` are populated from connection attributes, - and cannot be set in ``PipelineOptions`` class. + and cannot be overridden by the user in ``PipelineOptions`` to avoid issues. Examples -------- @@ -114,7 +117,7 @@ class MongoDBPipelineOptions(GenericOptions): .. code:: python MongoDB.PipelineOptions( - hint="{'_id': 1}", + hint={"some_field": 1}, ) """ @@ -130,7 +133,7 @@ class MongoDBReadOptions(GenericOptions): .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -138,7 +141,7 @@ class MongoDBReadOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection``, ``pipeline``, ``hint`` are populated from connection - attributes, and cannot be set in ``ReadOptions`` class. + attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -164,7 +167,7 @@ class MongoDBWriteOptions(GenericOptions): .. 
note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -172,7 +175,7 @@ class MongoDBWriteOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection`` are populated from connection attributes, - and cannot be set in ``WriteOptions`` class. + and cannot be overridden by the user in ``WriteOptions`` to avoid issues. Examples -------- diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index c2716ce72..608ef7521 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -29,30 +29,9 @@ class MSSQL(JDBCConnection): (`official MSSQL JDBC driver `_). - .. dropdown:: Version compatibility - - * SQL Server versions: 2014 - 2022 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_ - and `official compatibility matrix `_. - .. warning:: - To use MSSQL connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mssql-prerequisites` Parameters ---------- diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 224504962..ebf43f668 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -29,29 +29,9 @@ class MySQL(JDBCConnection): Based on Maven package ``com.mysql:mysql-connector-j:8.0.33`` (`official MySQL JDBC driver `_). - .. dropdown:: Version compatibility - - * MySQL server versions: 5.7, 8.0 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use MySQL connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mysql-prerequisites` Parameters ---------- @@ -78,7 +58,7 @@ class MySQL(JDBCConnection): extra : dict, default: ``None`` Specifies one or more extra parameters by which clients can connect to the instance. 
- For example: ``{"useSSL": "false"}`` + For example: ``{"useSSL": "false", "allowPublicKeyRetrieval": "true"}`` See `MySQL JDBC driver properties documentation `_ @@ -107,7 +87,7 @@ class MySQL(JDBCConnection): host="database.host.or.ip", user="user", password="*****", - extra={"useSSL": "false"}, + extra={"useSSL": "false", "allowPublicKeyRetrieval": "true"}, spark=spark, ) diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 6d87cb559..f6b1ad2d9 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -12,7 +12,10 @@ from textwrap import indent from typing import TYPE_CHECKING, Any, ClassVar, Optional -from pydantic import root_validator +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement from onetl._util.classproperty import classproperty @@ -70,29 +73,9 @@ class Oracle(JDBCConnection): Based on Maven package ``com.oracle.database.jdbc:ojdbc8:23.2.0.0`` (`official Oracle JDBC driver `_). - .. dropdown:: Version compatibility - - * Oracle Server versions: 23, 21, 19, 18, 12.2 and probably 11.2 (tested, but that's not official). - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Oracle connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`oracle-prerequisites` Parameters ---------- @@ -113,16 +96,16 @@ class Oracle(JDBCConnection): .. warning :: - Be careful, to correct work you must provide ``sid`` or ``service_name`` + You should provide either ``sid`` or ``service_name``, not both of them service_name : str, default: ``None`` Specifies one or more names by which clients can connect to the instance. - For example: ``MYDATA``. + For example: ``PDB1``. .. warning :: - Be careful, for correct work you must provide ``sid`` or ``service_name`` + You should provide either ``sid`` or ``service_name``, not both of them spark : :obj:`pyspark.sql.SparkSession` Spark session. @@ -130,24 +113,22 @@ class Oracle(JDBCConnection): extra : dict, default: ``None`` Specifies one or more extra parameters by which clients can connect to the instance. - For example: ``{"defaultBatchValue": 100}`` + For example: ``{"remarksReporting": "false"}`` - See `Oracle JDBC driver properties documentation - `_ - for more details + See official documentation: + * `Connection parameters `_ + * `Connection properties `_ Examples -------- - Oracle connection initialization + Connect to Oracle using ``sid``: .. code:: python from onetl.connection import Oracle from pyspark.sql import SparkSession - extra = {"defaultBatchValue": 100} - # Create Spark session with Oracle driver loaded maven_packages = Oracle.get_packages() spark = ( @@ -162,7 +143,22 @@ class Oracle(JDBCConnection): user="user", password="*****", sid="XE", - extra=extra, + extra={"remarksReporting": "false"}, + spark=spark, + ) + + Using ``service_name``: + + .. code:: python + + ... 
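+        # the imports and SparkSession setup are the same as in the ``sid`` example
+        # above and are omitted here for brevity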
+ + oracle = Oracle( + host="database.host.or.ip", + user="user", + password="*****", + service_name="PDB1", + extra={"remarksReporting": "false"}, spark=spark, ) diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 1353204fd..a4d27bcea 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -16,6 +16,9 @@ class PostgresExtra(GenericOptions): + # allows automatic conversion from text to target column type during write + stringtype: str = "unspecified" + class Config: extra = "allow" @@ -27,29 +30,9 @@ class Postgres(JDBCConnection): Based on Maven package ``org.postgresql:postgresql:42.6.0`` (`official Postgres JDBC driver `_). - .. dropdown:: Version compatibility - - * PostgreSQL server versions: 8.2 or higher - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Postgres connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`postgres-prerequisites` Parameters ---------- @@ -78,7 +61,7 @@ class Postgres(JDBCConnection): For example: ``{"ssl": "false"}`` - See `Postgres JDBC driver properties documentation `_ + See `Postgres JDBC driver properties documentation `_ for more details Examples diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 95741927b..0dc3e67dd 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -32,29 +32,9 @@ class Teradata(JDBCConnection): Based on package ``com.teradata.jdbc:terajdbc:17.20.00.15`` (`official Teradata JDBC driver `_). - .. dropdown:: Version compatibility - - * Teradata server versions: 16.10 - 20.0 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Teradata connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. 
+ Before using this connector please take into account :ref:`teradata-prerequisites` Parameters ---------- @@ -92,10 +72,10 @@ class Teradata(JDBCConnection): By default, these options are added to extra: * ``CHARSET = "UTF8"`` - * ``COLUMN_NAME = "ON"`` - * ``FLATTEN = "ON"`` + * ``COLUMN_NAME = "ON"`` - allow reading column title from a table + * ``FLATTEN = "ON"`` - improves error messages * ``MAYBENULL = "ON"`` - * ``STRICT_NAMES = "OFF"`` + * ``STRICT_NAMES = "OFF"`` - ignore Spark options passed to JDBC URL It is possible to override default values, for example set ``extra={"FLATTEN": "OFF"}`` diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 5e91648e1..07b8b53e1 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -2,14 +2,18 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import ftplib # noqa: S402 +import ftplib # noqa: S402 # nosec import os import textwrap from logging import getLogger from typing import Optional from etl_entities.instance import Host -from pydantic import SecretStr + +try: + from pydantic.v1 import SecretStr +except (ImportError, AttributeError): + from pydantic import SecretStr # type: ignore[no-redef, assignment] from onetl.base import PathStatProtocol from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 859cca942..9277b66fd 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) # SPDX-License-Identifier: Apache-2.0 -import ftplib # NOQA: S402 +import ftplib # noqa: S402 # nosec import textwrap from ftputil import FTPHost @@ -31,7 +31,7 @@ class TLSfix(ftplib.FTP_TLS): # noqa: N801 """ def ntransfercmd(self, cmd, rest=None): - conn, size = ftplib.FTP.ntransfercmd(self, cmd, rest) # noqa: S321 + conn, size = ftplib.FTP.ntransfercmd(self, cmd, rest) # noqa: S321 # nosec if self._prot_p: conn = self.context.wrap_socket( conn, diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 9cf4482d7..7105763fe 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Optional, Tuple from etl_entities.instance import Cluster, Host -from pydantic import Field, FilePath, SecretStr, root_validator, validator + +try: + from pydantic.v1 import Field, FilePath, SecretStr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, FilePath, SecretStr, root_validator, validator # type: ignore[no-redef, assignment] from onetl.base import PathStatProtocol from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 9deb016c2..7811c4c4b 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -29,7 +29,12 @@ ) from e from etl_entities.instance import Host -from pydantic import SecretStr, root_validator + +try: + from pydantic.v1 import SecretStr, root_validator +except (ImportError, AttributeError): + from pydantic import SecretStr, root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from 
onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 81c6cce8a..9fc0857fa 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -11,9 +11,13 @@ from typing import Optional, Union from etl_entities.instance import Host -from pydantic import SecretStr, validator from typing_extensions import Literal +try: + from pydantic.v1 import SecretStr, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, validator # type: ignore[no-redef, assignment] + from onetl.connection.file_connection.file_connection import FileConnection from onetl.hooks import slot, support_hooks from onetl.impl import LocalPath, RemotePath, RemotePathStat diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 61ed3ee95..3d64669b1 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -10,7 +10,11 @@ from typing import Optional from etl_entities.instance import Host -from pydantic import FilePath, SecretStr + +try: + from pydantic.v1 import FilePath, SecretStr +except (ImportError, AttributeError): + from pydantic import FilePath, SecretStr # type: ignore[no-redef, assignment] from onetl.connection.file_connection.file_connection import FileConnection from onetl.connection.file_connection.mixins.rename_dir_mixin import RenameDirMixin diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index b0eb7decc..f74eed3e3 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -12,7 +12,12 @@ from typing import Optional, Union from etl_entities.instance import Host -from pydantic import DirectoryPath, FilePath, SecretStr, root_validator + +try: + from pydantic.v1 import DirectoryPath, FilePath, SecretStr, root_validator +except (ImportError, AttributeError): + from pydantic import DirectoryPath, FilePath, SecretStr, root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_df_connection/spark_file_df_connection.py b/onetl/connection/file_df_connection/spark_file_df_connection.py index d3061385d..06121139f 100644 --- a/onetl/connection/file_df_connection/spark_file_df_connection.py +++ b/onetl/connection/file_df_connection/spark_file_df_connection.py @@ -7,7 +7,10 @@ from logging import getLogger from typing import TYPE_CHECKING -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.hadoop import get_hadoop_config from onetl._util.spark import try_import_pyspark diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 7f23e8b58..1c0ca6bb3 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -10,7 +10,11 @@ from typing import TYPE_CHECKING, Optional from etl_entities.instance import Cluster, Host -from pydantic import Field, PrivateAttr, validator + +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic 
import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.connection.file_df_connection.spark_file_df_connection import ( diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index d2accb68a..0ebd43683 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -6,7 +6,10 @@ import socket from pathlib import Path -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.connection.file_df_connection.spark_file_df_connection import ( diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index ff516abf7..46ce8bb33 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -8,7 +8,12 @@ from typing import TYPE_CHECKING, ClassVar, List, Optional from etl_entities.instance import Host -from pydantic import SecretStr, root_validator, validator + +try: + from pydantic.v1 import SecretStr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, root_validator, validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl._internal import stringify diff --git a/onetl/core/file_filter/file_filter.py b/onetl/core/file_filter/file_filter.py index a3dac8654..a8fd7b694 100644 --- a/onetl/core/file_filter/file_filter.py +++ b/onetl/core/file_filter/file_filter.py @@ -9,12 +9,18 @@ import warnings from typing import List, Optional, Union -from pydantic import Field, root_validator, validator +from typing_extensions import deprecated + +try: + from pydantic.v1 import Field, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator, validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel, RemotePath +@deprecated("Deprecated in 0.8.0 and will be removed in 1.0.0. Use Glob, Regexp or ExcludeDir instead", category=None) class FileFilter(BaseFileFilter, FrozenModel): r"""Filter files or directories by their path. diff --git a/onetl/core/file_limit/file_limit.py b/onetl/core/file_limit/file_limit.py index b178433e7..d3b98718b 100644 --- a/onetl/core/file_limit/file_limit.py +++ b/onetl/core/file_limit/file_limit.py @@ -5,12 +5,18 @@ import textwrap import warnings -from pydantic import validator +from typing_extensions import deprecated + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileLimit, PathProtocol from onetl.impl import FrozenModel +@deprecated("Deprecated in 0.8.0 and will be removed in 1.0.0. Use MaxFilesCount instead", category=None) class FileLimit(BaseFileLimit, FrozenModel): """Limits the number of downloaded files. 
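Both changes visible in the hunk above are applied consistently across this diff: pydantic symbols are imported through a ``pydantic.v1`` → ``pydantic`` fallback so the same modules work on pydantic 1.x and 2.x, and legacy classes such as ``FileFilter`` and ``FileLimit`` are marked with ``typing_extensions.deprecated`` in place of the ``deprecated`` package, which is dropped from ``requirements/core.txt`` further down. A minimal sketch of the combined pattern follows; ``LegacyFilter``, ``NewFilter`` and the ``glob`` field are placeholder names, not onETL code.

.. code:: python

    try:
        # pydantic 2.x re-exports the old API under the ``pydantic.v1`` namespace
        from pydantic.v1 import BaseModel, validator
    except (ImportError, AttributeError):
        # pydantic 1.x has no ``v1`` submodule, so fall back to the top-level package
        from pydantic import BaseModel, validator  # type: ignore[no-redef, assignment]

    from typing_extensions import deprecated


    # ``category=None`` marks the class as deprecated for type checkers and IDEs
    # without emitting any runtime DeprecationWarning when the class is used
    @deprecated("Deprecated in 0.8.0 and will be removed in 1.0.0. Use NewFilter instead", category=None)
    class LegacyFilter(BaseModel):
        glob: str = "*"

        @validator("glob")
        def _check_glob(cls, value):
            # reject obviously empty patterns; real validation lives in onetl.file.filter
            if not value:
                raise ValueError("glob pattern must not be empty")
            return value

Presumably ``category=None`` is chosen here because ``FileFilter`` and ``FileLimit`` already emit their own runtime warnings, whereas ``onetl/log.py`` later in the diff passes ``category=UserWarning`` to keep the old runtime behaviour.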
diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 7f6f6f0d9..af038c24f 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -11,7 +11,11 @@ from etl_entities.hwm import HWM, ColumnHWM, KeyValueHWM from etl_entities.old_hwm import IntHWM as OldColumnHWM from etl_entities.source import Column, Table -from pydantic import Field, PrivateAttr, root_validator, validator + +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import ( @@ -103,6 +107,11 @@ class DBReader(FrozenModel): It is recommended to pass column names explicitly to avoid selecting too many columns, and to avoid adding unexpected columns to dataframe if source DDL is changed. + .. deprecated:: 0.10.0 + + Syntax ``DBReader(columns="col1, col2")`` (string instead of list) is not supported, + and will be removed in v1.0.0 + where : Any, default: ``None`` Custom ``where`` for SQL query or MongoDB pipeline. @@ -130,9 +139,7 @@ class DBReader(FrozenModel): .. code:: python - from onetl.hwm import AutoDetectHWM - - hwm = AutoDetectHWM( + hwm = DBReader.AutoDetectHWM( name="some_unique_hwm_name", expression="hwm_column", ) @@ -143,9 +150,7 @@ class DBReader(FrozenModel): .. code:: python - from onetl.hwm import AutoDetectHWM - - hwm = AutoDetectHWM( + hwm = DBReader.AutoDetectHWM( name="some_unique_hwm_name", expression="cast(hwm_column_orig as date)", ) @@ -332,7 +337,6 @@ class DBReader(FrozenModel): from onetl.db import DBReader from onetl.connection import Postgres from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession maven_packages = Postgres.get_packages() @@ -385,8 +389,8 @@ def validate_source(cls, source, values): connection: BaseDBConnection = values["connection"] return connection.dialect.validate_name(source) - @validator("columns", always=True) # noqa: WPS231 - def validate_columns(cls, value: list[str] | None, values: dict) -> list[str] | None: + @validator("columns", always=True, pre=True) + def validate_columns(cls, value: str | list[str] | None, values: dict) -> list[str] | None: connection: BaseDBConnection = values["connection"] return connection.dialect.validate_columns(value) diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index e79bb5205..eb83c9eb8 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -5,7 +5,10 @@ from logging import getLogger from typing import TYPE_CHECKING, Optional -from pydantic import Field, PrivateAttr, validator +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import BaseDBConnection from onetl.hooks import slot, support_hooks diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 5b8d9bed6..6967387a6 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -7,7 +7,11 @@ from typing import TYPE_CHECKING, Iterable, Optional from ordered_set import OrderedSet -from pydantic import PrivateAttr, validator + +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from 
pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import BaseFileDFConnection, BaseReadableFileFormat, PurePathProtocol diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index ea4564549..7fdbe64d9 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, Optional -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.spark import get_pyspark_version from onetl.base import FileDFReadOptions diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index 3b7a2724c..a80f54801 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -5,7 +5,10 @@ import logging from typing import TYPE_CHECKING -from pydantic import PrivateAttr, validator +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileDFConnection, BaseWritableFileFormat, PurePathProtocol from onetl.file.file_df_writer.options import FileDFWriterOptions diff --git a/onetl/file/file_df_writer/options.py b/onetl/file/file_df_writer/options.py index 01d808135..01bd9ee48 100644 --- a/onetl/file/file_df_writer/options.py +++ b/onetl/file/file_df_writer/options.py @@ -7,7 +7,10 @@ from enum import Enum from typing import TYPE_CHECKING, ContextManager, Iterable, List, Optional, Union -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl._util.spark import inject_spark_param from onetl.base import FileDFWriteOptions diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index a3db0ef19..f179491e0 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -16,7 +16,11 @@ from etl_entities.old_hwm import FileListHWM as OldFileListHWM from etl_entities.source import RemoteFolder from ordered_set import OrderedSet -from pydantic import Field, PrivateAttr, root_validator, validator + +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import generate_temp_path from onetl.base import BaseFileConnection, BaseFileFilter, BaseFileLimit diff --git a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index 11b776239..07cc2abc3 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 7d6233025..754dcdc04 
100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedRemoteFile, LocalPath, RemoteFile, RemotePath diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index c39f2170f..1c0b55dc7 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -9,7 +9,11 @@ from typing import Iterable, List, Optional, Tuple from ordered_set import OrderedSet -from pydantic import Field, PrivateAttr, validator + +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileConnection, BaseFileFilter, BaseFileLimit from onetl.base.path_protocol import PathProtocol, PathWithStatsProtocol diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index 78a6fd63d..b68326198 100644 --- a/onetl/file/file_mover/options.py +++ b/onetl/file/file_mover/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 59fc433e8..90175a2d4 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedRemoteFile, RemoteFile, RemotePath diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index 0ba074288..2841741f9 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -6,7 +6,11 @@ from typing import Iterable from humanize import naturalsize -from pydantic import Field, validator + +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.exception import ( diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index d4e77ecdd..ba107310b 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -9,7 +9,11 @@ from typing import Iterable, Optional, Tuple from ordered_set import OrderedSet -from pydantic import PrivateAttr, validator + +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl._internal import generate_temp_path from onetl.base import BaseFileConnection diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index af4e588fe..9789d8477 100644 --- 
a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 48fbbcc85..9a785c719 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedLocalFile, LocalPath, RemoteFile diff --git a/onetl/file/filter/exclude_dir.py b/onetl/file/filter/exclude_dir.py index 33b387623..a7d9007a5 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -4,7 +4,10 @@ import os -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol, PurePathProtocol from onetl.impl import FrozenModel, RemotePath diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index 045ee1bcc..768261e60 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -4,7 +4,10 @@ import glob -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index e07ba7039..0c64001da 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -5,7 +5,10 @@ import os import re -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 04d07af84..551c76c9b 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -6,7 +6,10 @@ import logging from typing import TYPE_CHECKING, ClassVar, Dict, Optional -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index e21eb5d82..41794f7ef 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, ClassVar -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks diff --git a/onetl/file/format/xml.py 
b/onetl/file/format/xml.py index 794593d34..47190ea77 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -5,7 +5,10 @@ import logging from typing import TYPE_CHECKING, ClassVar -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version diff --git a/onetl/hwm/auto_hwm.py b/onetl/hwm/auto_hwm.py index 26ee406c8..5c42f3579 100644 --- a/onetl/hwm/auto_hwm.py +++ b/onetl/hwm/auto_hwm.py @@ -5,7 +5,12 @@ from typing import Any from etl_entities.hwm import HWM -from pydantic import root_validator + +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal diff --git a/onetl/hwm/store/yaml_hwm_store.py b/onetl/hwm/store/yaml_hwm_store.py index 759637bab..2ffd24f68 100644 --- a/onetl/hwm/store/yaml_hwm_store.py +++ b/onetl/hwm/store/yaml_hwm_store.py @@ -14,7 +14,11 @@ register_hwm_store_class, ) from platformdirs import user_data_dir -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.hooks import slot, support_hooks from onetl.impl import FrozenModel, LocalPath diff --git a/onetl/impl/base_model.py b/onetl/impl/base_model.py index cf76cd48c..3208619e5 100644 --- a/onetl/impl/base_model.py +++ b/onetl/impl/base_model.py @@ -1,10 +1,15 @@ # SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file + from __future__ import annotations from typing import ClassVar -from pydantic import BaseModel as PydanticBaseModel +try: + from pydantic.v1 import BaseModel as PydanticBaseModel +except (ImportError, AttributeError): + from pydantic import BaseModel as PydanticBaseModel # type: ignore[no-redef, assignment] class BaseModel(PydanticBaseModel): diff --git a/onetl/impl/generic_options.py b/onetl/impl/generic_options.py index 057e69a65..8d3e629b9 100644 --- a/onetl/impl/generic_options.py +++ b/onetl/impl/generic_options.py @@ -7,7 +7,10 @@ from fnmatch import fnmatch from typing import Iterable, TypeVar -from pydantic import root_validator +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] from onetl.impl.frozen_model import FrozenModel diff --git a/onetl/log.py b/onetl/log.py index c97fe6620..32c239b8b 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -11,8 +11,8 @@ from textwrap import dedent from typing import TYPE_CHECKING, Any, Collection, Iterable -from deprecated import deprecated from etl_entities.hwm import HWM +from typing_extensions import deprecated if TYPE_CHECKING: from pyspark.sql import DataFrame @@ -33,9 +33,7 @@ @deprecated( - version="0.5.0", - reason="Will be removed in 1.0.0, use 'setup_logging' instead", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'setup_logging' instead", category=UserWarning, ) def setup_notebook_logging(level: int | str = logging.INFO) -> None: diff --git a/onetl/strategy/batch_hwm_strategy.py b/onetl/strategy/batch_hwm_strategy.py index dc02fb29a..1e5ec5b5b 100644 --- a/onetl/strategy/batch_hwm_strategy.py +++ b/onetl/strategy/batch_hwm_strategy.py @@ -6,7 +6,10 @@ from textwrap import dedent from typing import Any, ClassVar -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.hwm import Edge from onetl.strategy.hwm_strategy import HWMStrategy diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 8946f7d5d..267b01613 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -196,7 +196,6 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import Postgres from onetl.db import DBReader from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -290,7 +289,6 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import Kafka from onetl.db import DBReader from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -323,7 +321,7 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import SFTP from onetl.file import FileDownloader from onetl.strategy import SnapshotStrategy - from etl_entities import FileListHWM + from etl_entities.hwm import FileListHWM sftp = SFTP( host="sftp.domain.com", @@ -495,7 +493,6 @@ class IncrementalBatchStrategy(OffsetMixin, BatchHWMStrategy): from onetl.connection import Postgres, Hive from onetl.db import DBReader from onetl.strategy import IncrementalBatchStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 49bce44dd..5368b2039 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -55,7 +55,6 @@ class SnapshotStrategy(BaseStrategy): from onetl.connection import Postgres from onetl.db import DBReader from onetl.strategy import SnapshotStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -227,7 +226,6 @@ class SnapshotBatchStrategy(BatchHWMStrategy): from onetl.connection import Postgres, Hive from onetl.db import DBReader from onetl.strategy import SnapshotBatchStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession diff --git a/requirements/core.txt b/requirements/core.txt index f0ca45793..c8e245a4a 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,5 +1,4 @@ -deprecated -etl-entities>=2.2,<2.3 +etl-entities>=2.2,<2.4 evacuator>=1.0,<1.1 frozendict humanize @@ -9,8 +8,8 @@ ordered-set platformdirs # https://github.com/pydantic/pydantic/issues/4092 # https://github.com/pydantic/pydantic/issues/4498 -pydantic>=1.9.2,!=1.10.2,<2 +pydantic>=1.9.2,!=1.10.2,<3 pyyaml # https://github.com/python/typing_extensions/pull/188 -typing_extensions>=4.0.0,!=4.6.1; python_version == "3.7" -typing_extensions>=4.0.0; python_version > "3.7" +typing_extensions>=4.5.0,!=4.6.1; python_version == "3.7" +typing_extensions>=4.5.0; python_version > "3.7" diff --git a/requirements/docs-plantuml.txt b/requirements/docs-plantuml.txt deleted file 
mode 100644 index efaaff7ac..000000000 --- a/requirements/docs-plantuml.txt +++ /dev/null @@ -1,2 +0,0 @@ ---no-deps -sphinx-plantuml diff --git a/requirements/docs.txt b/requirements/docs.txt index fedf2562a..3776dbb09 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,4 +1,4 @@ -autodoc-pydantic<2.0.0 +autodoc-pydantic<2 furo importlib-resources<6 numpydoc diff --git a/requirements/tests/pydantic-1.txt b/requirements/tests/pydantic-1.txt new file mode 100644 index 000000000..fd72c6577 --- /dev/null +++ b/requirements/tests/pydantic-1.txt @@ -0,0 +1 @@ +pydantic<2 diff --git a/requirements/tests/pydantic-2.txt b/requirements/tests/pydantic-2.txt new file mode 100644 index 000000000..8b2c779e6 --- /dev/null +++ b/requirements/tests/pydantic-2.txt @@ -0,0 +1 @@ +pydantic>=2,<3 diff --git a/requirements/tests/pydantic-latest.txt b/requirements/tests/pydantic-latest.txt new file mode 100644 index 000000000..2d5452210 --- /dev/null +++ b/requirements/tests/pydantic-latest.txt @@ -0,0 +1 @@ +pydantic>=2 diff --git a/setup.cfg b/setup.cfg index d73a28390..96b763b7c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ max-cognitive-score = 20 # Max amount of cognitive complexity per module max-cognitive-average = 25 max-imports = 25 -max-imported-names = 55 +max-imported-names = 60 # Max of expression usages in a module max-module-expressions = 15 # Max of expression usages in a function @@ -269,6 +269,7 @@ ignore = WPS201, # WPS429 Found multiple assign targets WPS429, +# https://github.com/wemake-services/wemake-python-styleguide/issues/2847 # E704 multiple statements on one line: def func(): ... E704, # WPS220 Found too deep nesting: 46 > 20 @@ -426,10 +427,10 @@ per-file-ignores = docstring_style = sphinx [mypy] -python_version = 3.7 +python_version = 3.8 # TODO: remove later exclude = ^(?=.*file).* -strict_optional=True +strict_optional = True # ignore typing in third-party packages ignore_missing_imports = True follow_imports = silent diff --git a/tests/.coveragerc b/tests/.coveragerc index 295122b8a..5d7fb16f9 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -20,3 +20,4 @@ exclude_lines = spark = SparkSession._instantiatedSession if log.isEnabledFor(logging.DEBUG): if sys.version_info + except .*ImportError diff --git a/tests/fixtures/create_keytab.py b/tests/fixtures/create_keytab.py index 368c36d59..c44cb1a62 100644 --- a/tests/fixtures/create_keytab.py +++ b/tests/fixtures/create_keytab.py @@ -14,4 +14,4 @@ def create_keytab(tmp_path_factory): @pytest.fixture def keytab_md5(): - return hashlib.md5(b"content").hexdigest() # noqa: S324 + return hashlib.md5(b"content").hexdigest() # noqa: S324 # nosec diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 3196210e1..b2559d520 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -59,7 +59,12 @@ def maven_packages(): with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" if with_greenplum: # Greenplum connector jar is not publicly available, - packages.extend(Greenplum.get_packages(spark_version=pyspark_version)) + packages.extend( + Greenplum.get_packages( + spark_version=pyspark_version, + package_version=os.getenv("ONETL_GP_PACKAGE_VERSION") or None, + ), + ) if pyspark_version >= (2, 4): # There is no Avro package for Spark 2.3 @@ -78,7 +83,6 @@ def maven_packages(): packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) # There is no Excel files support for Spark less than 3.2 - # And there is still no package released for 3.5.0 
https://github.com/crealytics/spark-excel/issues/787 packages.extend(Excel.get_packages(spark_version=pyspark_version)) return packages diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 07b30cb07..a841f09f1 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -289,7 +289,10 @@ def test_hive_writer_insert_into_with_options(spark, processing, get_schema_tabl options=options, ) - error_msg = f"|Hive| Options {option_kv} are not supported while inserting into existing table, ignoring" + error_msg = ( + f"|Hive| User-specified options {option_kv} are ignored while inserting into existing table. " + "Using only table parameters from Hive metastore" + ) with caplog.at_level(logging.WARNING): # write to table with new options writer2.run(df2) diff --git a/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py index 6283df1a1..86daa7d91 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py @@ -48,10 +48,9 @@ def test_mongodb_connection_check_fail(processing, spark): mongo.check() -def test_mongodb_connection_read_pipeline_match( +def test_mongodb_connection_read_pipeline( spark, prepare_schema_table, - load_table_data, processing, ): mongo = MongoDB( @@ -63,23 +62,60 @@ def test_mongodb_connection_read_pipeline_match( spark=spark, ) + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, - pipeline={"$match": {"_id": {"$eq": 1}}}, ) - assert df - assert df.count() == 1 + processing.assert_equal_df( + df=df, + other_frame=write_df, + ) - collected = df.collect() - assert collected[0][0] == 1 # _id +def test_mongodb_connection_read_pipeline_match( + spark, + prepare_schema_table, + processing, +): + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + + df = mongo.pipeline( + collection=prepare_schema_table.table, + pipeline={"$match": {"_id": {"$eq": 1}}}, + ) + + processing.assert_equal_df( + df=df, + other_frame=write_df[write_df._id == 1], + ) def test_mongodb_connection_read_pipeline_match_with_df_schema( spark, prepare_schema_table, - load_table_data, processing, ): mongo = MongoDB( @@ -91,21 +127,27 @@ def test_mongodb_connection_read_pipeline_match_with_df_schema( spark=spark, ) + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, pipeline={"$match": {"_id": {"$eq": 1}}}, df_schema=prepare_schema_table.schema, ) - assert df - assert df.count() == 1 - - 
collected = df.collect() - - assert collected[0]["_id"] == 1 + processing.assert_equal_df( + df=df, + other_frame=write_df[write_df._id == 1], + ) -def test_mongodb_connection_read_pipeline_group(spark, prepare_schema_table, load_table_data, processing): +def test_mongodb_connection_read_pipeline_group(spark, prepare_schema_table, processing): mongo = MongoDB( host=processing.host, port=processing.port, @@ -115,6 +157,14 @@ def test_mongodb_connection_read_pipeline_group(spark, prepare_schema_table, loa spark=spark, ) + write_df = processing.create_pandas_df(min_id=1, max_id=100) + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, pipeline={"$group": {"_id": 1, "min": {"$min": "$hwm_int"}, "max": {"$max": "$hwm_int"}}}, diff --git a/tests/tests_integration/tests_hwm_store_integration.py b/tests/tests_integration/tests_hwm_store_integration.py index 8b6938b33..1b589de2d 100644 --- a/tests/tests_integration/tests_hwm_store_integration.py +++ b/tests/tests_integration/tests_hwm_store_integration.py @@ -11,7 +11,7 @@ hwm_store = [ MemoryHWMStore(), - YAMLHWMStore(path=tempfile.mktemp("hwmstore")), # noqa: S306 NOSONAR + YAMLHWMStore(path=tempfile.mktemp("hwmstore")), # noqa: S306 # nosec ] diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py index 1ba96cee1..7ae146cca 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py @@ -26,7 +26,7 @@ def test_kafka_strategy_incremental( kafka_topic, num_partitions, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_type = KeyValueIntHWM hwm_name = secrets.token_hex(5) @@ -71,18 +71,23 @@ def test_kafka_strategy_incremental( with IncrementalStrategy(): first_df = reader.run() + # all the data has been read + deserialized_first_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_first_df, other_frame=first_span, order_by="id_int") + hwm = store.get_hwm(hwm_name) assert hwm is not None assert isinstance(hwm, hwm_type) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} - assert hwm.value == partition_count_dict_first_df - - # all the data has been read - deserialized_first_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) - processing.assert_equal_df(df=deserialized_first_df, other_frame=first_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + partition_offsets_initial = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_initial # insert second span 
processing.insert_pandas_df_into_topic(second_span, kafka_topic) @@ -90,17 +95,21 @@ def test_kafka_strategy_incremental( with IncrementalStrategy(): second_df = reader.run() - hwm = store.get_hwm(hwm_name) - - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = first_df.union(second_df) - partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert hwm.value == partition_count_dict_combined - + # only new data has been read deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(hwm_name) + partition_offsets_new = partition_offsets_initial.copy() + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new + @pytest.mark.parametrize( "num_partitions", @@ -117,7 +126,7 @@ def test_kafka_strategy_incremental_nothing_to_read( num_partitions, kafka_topic, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_name = secrets.token_hex(5) store = HWMStoreStackManager.get_current() @@ -170,15 +179,20 @@ def test_kafka_strategy_incremental_nothing_to_read( assert reader.has_data() first_df = reader.run() - hwm = store.get_hwm(name=hwm_name) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} - assert hwm.value == partition_count_dict_first_df - deserialized_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(name=hwm_name) + partition_offsets_initial = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_initial + # no new data yet, nothing to read with IncrementalStrategy(): df = reader.run() @@ -186,7 +200,7 @@ def test_kafka_strategy_incremental_nothing_to_read( assert not df.count() # HWM value is unchanged hwm = store.get_hwm(name=hwm_name) - assert hwm.value == partition_count_dict_first_df + assert hwm.value == partition_offsets_initial # insert second span processing.insert_pandas_df_into_topic(second_span, kafka_topic) @@ -195,38 +209,44 @@ def test_kafka_strategy_incremental_nothing_to_read( assert not df.count() # HWM value is unchanged hwm = store.get_hwm(name=hwm_name) - assert hwm.value == partition_count_dict_first_df + assert hwm.value == partition_offsets_initial # read data with IncrementalStrategy(): - df = reader.run() + second_df = reader.run() - hwm = store.get_hwm(name=hwm_name) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = df.union(first_df) - 
partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert hwm.value == partition_count_dict_combined + # only new data has been read + deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") - deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) - processing.assert_equal_df(df=deserialized_df, other_frame=second_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(name=hwm_name) + partition_offsets_new = partition_offsets_initial.copy() + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new @pytest.mark.parametrize( - "initial_partitions, additional_partitions", + "initial_partitions, new_partitions", [ - (3, 2), # starting with 3 partitions, adding 2 more - (5, 1), # starting with 5 partitions, adding 1 more + (3, 5), + (5, 6), ], ) def test_kafka_strategy_incremental_with_new_partition( spark, processing, initial_partitions, - additional_partitions, + new_partitions, kafka_topic, + kafka_dataframe_schema, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_name = secrets.token_hex(5) store = HWMStoreStackManager.get_current() @@ -267,21 +287,38 @@ def test_kafka_strategy_incremental_with_new_partition( # Specified: Set(topic1, topic2) Assigned: Set(topic1, topic2, additional_topic3, additional_topic4) first_df.cache() + # all the data has been read + deserialized_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + + # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(name=hwm_name) - first_run_hwm_keys_num = len(hwm.value.keys()) + partition_offsets_initial = dict.fromkeys(range(initial_partitions), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) - processing.change_topic_partitions(kafka_topic, initial_partitions + additional_partitions) + processing.change_topic_partitions(kafka_topic, new_partitions) processing.insert_pandas_df_into_topic(second_span, kafka_topic) with IncrementalStrategy(): second_df = reader.run() + # only new data has been read + deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") + + # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(name=hwm_name) - second_run_hwm_keys_num = len(hwm.value) - assert first_run_hwm_keys_num + additional_partitions == second_run_hwm_keys_num - - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = second_df.union(first_df) - partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert 
hwm.value == partition_count_dict_combined + partition_offsets_new = dict.fromkeys(range(new_partitions), 0) + partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new diff --git a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py index a21fb1daa..5f61e6a07 100644 --- a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py +++ b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py @@ -58,7 +58,6 @@ def test_reader_hive_with_read_options(spark_mock): (), {}, set(), - "any,string", ], ) def test_reader_invalid_columns(spark_mock, columns): @@ -73,6 +72,7 @@ def test_reader_invalid_columns(spark_mock, columns): @pytest.mark.parametrize( "columns, real_columns", [ + (None, ["*"]), (["*"], ["*"]), (["abc", "cde"], ["abc", "cde"]), (["*", "abc"], ["*", "abc"]), @@ -88,6 +88,35 @@ def test_reader_valid_columns(spark_mock, columns, real_columns): assert reader.columns == real_columns +@pytest.mark.parametrize( + "column, real_columns, msg", + [ + ( + "*", + ["*"], + "Passing DBReader(columns='*') is deprecated since v0.10.0 and will be removed in v1.0.0. " + "Use DBReader(columns=['*'] instead", + ), + ( + "some, column", + ["some, column"], + "Passing DBReader(columns='some, column') is deprecated since v0.10.0 and will be removed in v1.0.0. " + "Use DBReader(columns=['some, column'] instead", + ), + ], + ids=["*", "some, column"], +) +def test_reader_legacy_columns(spark_mock, column, real_columns, msg): + with pytest.warns(UserWarning, match=re.escape(msg)): + reader = DBReader( + connection=Hive(cluster="rnd-dwh", spark=spark_mock), + table="schema.table", + columns=column, + ) + + assert reader.columns == real_columns + + @pytest.mark.parametrize( "hwm_column, real_hwm_expression", [ diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 71308e2f9..bda06d282 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -15,9 +15,9 @@ def test_greenplum_driver(): def test_greenplum_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Greenplum.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.3.0" - assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.3.0" - assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.3.0" + assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.2.0" + assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.2.0" + assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.2.0" def test_greenplum_get_packages_no_input(): @@ -56,15 +56,15 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): "spark_version, scala_version, package", [ # use Scala version directly - (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), - (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), # Detect Scala 
version by Spark version - ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.2.0"), # Override Scala version detected automatically - ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): @@ -74,7 +74,7 @@ def test_greenplum_get_packages(spark_version, scala_version, package): @pytest.mark.parametrize( "package_version, scala_version, package", [ - (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), ("2.3.0", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), ("2.1.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.1.4"), ], diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py index 01cffd4a8..891406ba4 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py @@ -103,7 +103,7 @@ def get_current_cluster() -> str: def test_hive_old_options_deprecated(): - warning_msg = "Please use 'WriteOptions' class instead. Will be removed in v1.0.0" + warning_msg = "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'WriteOptions' instead" with pytest.warns(UserWarning, match=warning_msg): options = Hive.Options(some="value") diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index f932408d0..7c8ecfcca 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -129,7 +129,7 @@ def test_jdbc_read_options_cannot_be_used_in_write_options(arg, value): ], ) def test_jdbc_old_options_allowed_but_deprecated(arg, value): - warning_msg = "Please use 'ReadOptions' or 'WriteOptions' class instead. Will be removed in v1.0.0" + warning_msg = "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'ReadOptions' or 'WriteOptions' instead" with pytest.warns(UserWarning, match=warning_msg): options = Postgres.Options.parse({arg: value}) @@ -228,7 +228,7 @@ def test_jdbc_read_options_to_jdbc(spark_mock): "CamelCaseOption": "left unchanged", }, "upperBound": "1000", - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc", + "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", } @@ -257,7 +257,7 @@ def test_jdbc_write_options_to_jdbc(spark_mock): "camelCaseOption": "left unchanged", "CamelCaseOption": "left unchanged", }, - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc", + "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", } diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index 01f85eb08..f6b5f4e9b 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -55,7 +55,7 @@ def test_postgres(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&stringtype=unspecified" assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -71,7 +71,7 @@ def test_postgres_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&stringtype=unspecified" def test_postgres_without_database_error(spark_mock): @@ -89,7 +89,10 @@ def test_postgres_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true" + assert ( + conn.jdbc_url + == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true&stringtype=unspecified" + ) def test_postgres_without_mandatory_args(spark_mock):
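Stepping back from the test updates above: they all follow from the new ``stringtype: str = "unspecified"`` default in ``PostgresExtra``, which lets the driver cast text parameters to the target column type during writes and is now appended to every generated JDBC URL. A rough usage sketch under assumed values follows; the host, credentials and Spark session setup are placeholders in the style of the docstring examples, and ``"varchar"`` is simply the driver's other accepted value for this option, not something introduced by this diff.

.. code:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Postgres

    # Spark session with the Postgres JDBC driver on the classpath
    spark = (
        SparkSession.builder
        .config("spark.jars.packages", ",".join(Postgres.get_packages()))
        .getOrCreate()
    )

    # default behaviour after this change: the URL gains ``&stringtype=unspecified``,
    # matching the assertions in the updated unit tests
    postgres = Postgres(
        host="database.host.or.ip",
        user="user",
        password="*****",
        database="database",
        spark=spark,
    )

    # the default should still be overridable through ``extra``
    postgres_strict = Postgres(
        host="database.host.or.ip",
        user="user",
        password="*****",
        database="database",
        extra={"stringtype": "varchar"},
        spark=spark,
    )

Because ``PostgresExtra`` keeps ``extra = "allow"``, any other driver property can still be passed alongside ``stringtype`` in the same dictionary.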