diff --git a/.env.dependencies b/.env.dependencies index af892898b..5fccfa159 100644 --- a/.env.dependencies +++ b/.env.dependencies @@ -13,24 +13,14 @@ KAFKA_CFG_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANON KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT KAFKA_CFG_SASL_ENABLED_MECHANISMS=PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 -# old config names for <1.1.1 -KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 -KAFKA_INTER_BROKER_LISTENER_NAME=INTERNAL_PLAINTEXT_ANONYMOUS -KAFKA_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://:9093,INTERNAL_PLAINTEXT_SASL://:9094,EXTERNAL_PLAINTEXT_SASL://:9095 -KAFKA_ADVERTISED_LISTENERS=INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 -KAFKA_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT -KAFKA_SASL_ENABLED_MECHANISMS=PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 # Mongo MONGO_INITDB_ROOT_USERNAME=onetl MONGO_INITDB_ROOT_PASSWORD=E4j7h!9A # MSSQL -MSSQL_DB=onetl -MSSQL_USER=onetl -MSSQL_PASSWORD=7ellowEl7akey ACCEPT_EULA=Y -SA_PASSWORD=2astazeY +MSSQL_SA_PASSWORD=2astazeY # MySQL MYSQL_ROOT_PASSWORD=ohbuz9Eochaj9saibooK3thooGa5aesh diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml new file mode 100644 index 000000000..b960a7492 --- /dev/null +++ b/.github/workflows/cache-cleanup.yml @@ -0,0 +1,40 @@ +name: Cleanup caches after merge +on: + pull_request: + types: + - closed + 
workflow_dispatch: + +jobs: + cleanup: + runs-on: ubuntu-latest + permissions: + # `actions:write` permission is required to delete caches + # See also: https://docs.github.com/en/rest/actions/cache?apiVersion=2022-11-28#delete-a-github-actions-cache-for-a-repository-using-a-cache-id + actions: write + contents: read + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Cleanup cache + run: | + gh extension install actions/gh-actions-cache + + REPO=${{ github.repository }} + BRANCH=refs/pull/${{ github.event.pull_request.number }}/merge + + echo "Fetching list of cache key" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH --limit 100 --sort size | cut -f 1 ) + + ## Setting this to not fail the workflow while deleting cache keys. + set +e + echo "Deleting caches..." + for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index f3397b706..a1e1e7ef5 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -49,8 +49,6 @@ jobs: restore-keys: | ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql-${{ hashFiles('requirements*.txt') }} ${{ runner.os }}-python-${{ env.DEFAULT_PYTHON }}-codeql- - ${{ runner.os }}-python - ${{ runner.os }}- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 9c8c558ba..1469100a3 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -22,19 +22,19 @@ latest: &latest matrix: small: - clickhouse-image: 
clickhouse/clickhouse-server - clickhouse-version: 23.6.1-alpine + clickhouse-version: 24.3.2.23-alpine <<: *max full: - # the lowest supported Clickhouse version by JDBC driver + # Clickhouse version with proper DateTime > DateTime64 comparison - clickhouse-image: yandex/clickhouse-server - clickhouse-version: '20.7' + clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 23.6.1-alpine + clickhouse-version: 24.3.2.23-alpine <<: *max nightly: - clickhouse-image: yandex/clickhouse-server - clickhouse-version: '20.7' + clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server clickhouse-version: latest-alpine diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index a7339e139..d20f074ab 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/core/tracked.txt b/.github/workflows/data/core/tracked.txt index da678a6a1..5b2a3ca4d 100644 --- a/.github/workflows/data/core/tracked.txt +++ b/.github/workflows/data/core/tracked.txt @@ -2,5 +2,7 @@ onetl/hooks/** onetl/plugins/** onetl/impl/** onetl/hwm/** +onetl/_util/** onetl/_internal.py onetl/log.py +.github/workflows/data/core/** diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml index 49468d914..d01c39029 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ b/.github/workflows/data/ftp/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/ftps image has only latest tag + # chonjay21/ftps image has only latest tag - ftp-version: latest <<: *max full: diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index ec5a862cd..efe28e79a 100644 --- 
a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/ftps image has only latest tag + # chonjay21/ftps image has only latest tag - ftps-version: latest <<: *max full: diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 2b66c0e19..28ec20e75 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -28,14 +28,14 @@ matrix: package-version: 2.3.1 <<: *max full: - - greenplum-version: 6.25.3 + - greenplum-version: 6.23.1 package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 package-version: 2.3.1 <<: *max nightly: - - greenplum-version: 6.25.3 + - greenplum-version: 6.23.1 package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index e62f0242a..6d8156c50 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -8,7 +8,7 @@ min: &min max: &max hadoop-version: hadoop3-hdfs - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 0f7d4ba6b..6ce0d7a8e 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 29e587721..8050948b7 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -1,6 +1,8 @@ min: &min - # kafka_version: 0.10.2-1-r3 - kafka-version: 3.5.1 + # Headers are supported only since 2.x. 
+ # Images before 3.2.3 are not creating kafka_jaas.conf properly, and failing to start + # https://github.com/bitnami/containers/blob/9db9064668365cac89bff58259f63eb78bb97e79/bitnami/kafka/README.md?plain=1#L933 + kafka-version: 3.2.3 pydantic-version: 1 spark-version: 2.4.8 python-version: '3.7' @@ -8,9 +10,9 @@ min: &min os: ubuntu-latest max: &max - kafka-version: 3.5.1 + kafka-version: 3.7.0 pydantic-version: 2 - spark-version: 3.5.0 + spark-version: 3.5.1 python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index b3db2391f..d1337291e 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -20,6 +20,7 @@ min_excel: &min_excel os: ubuntu-latest max: &max + # Excel package currently has no release for 3.5.1 spark-version: 3.5.0 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/local-fs/tracked.txt b/.github/workflows/data/local-fs/tracked.txt index bb8d4c276..a61170172 100644 --- a/.github/workflows/data/local-fs/tracked.txt +++ b/.github/workflows/data/local-fs/tracked.txt @@ -2,3 +2,19 @@ **/*local-fs* **/*local_fs*/** **/*local-fs*/** +**/*csv* +**/*csv*/** +**/*json* +**/*json*/** +**/*xml* +**/*xml*/** +**/*excel* +**/*excel*/** +**/*avro* +**/*avro*/** +**/*orc* +**/*orc*/** +**/*parquet* +**/*parquet*/** +**/*file_format* +**/*file_format*/** diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index c916cc306..a07bdd3b7 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -7,7 +7,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.4.2 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -22,12 +22,12 @@ latest: &latest matrix: small: - - mongodb-version: 6.0.7 + - mongodb-version: 7.0.9 <<: *max full: - mongodb-version: 4.0.0 <<: 
*min - - mongodb-version: 6.0.7 + - mongodb-version: 7.0.9 <<: *max nightly: - mongodb-version: 4.0.0 diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index 0138805bb..c46d98d03 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -21,16 +21,15 @@ latest: &latest matrix: small: - - mssql-version: v2017.CU24.0 + - mssql-version: 2022-CU12-ubuntu-22.04 <<: *max full: - - mssql-version: v2017.CU24.0 + - mssql-version: 2017-GA-ubuntu <<: *min - # v2019.CU4.0 is not very stable - - mssql-version: v2017.CU24.0 + - mssql-version: 2022-CU12-ubuntu-22.04 <<: *max nightly: - - mssql-version: v2017.CU24.0 + - mssql-version: 2017-GA-ubuntu <<: *min - - mssql-version: v2017.CU24.0 + - mssql-version: latest <<: *latest diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index 9b64e3b93..8e46b42e8 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -21,17 +21,17 @@ latest: &latest matrix: small: - - mysql-version: 8.0.33 + - mysql-version: 8.4.0 <<: *max full: - # Min supported version by JDBC driver is 5.7 - - mysql-version: 5.7.42 + # Min supported version by JDBC driver is 5.7 + - mysql-version: 5.7.6 <<: *min - # Max supported version by JDBC driver is 8.0 - - mysql-version: 8.0.33 + # Max supported version by JDBC driver is 8.3 + - mysql-version: 8.4.0 <<: *max nightly: - - mysql-version: 5.7.42 + - mysql-version: 5.7.6 <<: *min - mysql-version: latest <<: *latest diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 
55dc4c185..c0a50fc2b 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -30,10 +30,6 @@ matrix: oracle-version: 11.2.0.2-slim-faststart db-name: XE <<: *min - - oracle-image: gvenzl/oracle-xe - oracle-version: 21.3.0-slim-faststart - db-name: XEPDB1 - <<: *max - oracle-image: gvenzl/oracle-free oracle-version: 23.3-slim-faststart db-name: FREEPDB1 diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 8cdff4f63..7b8e296e5 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 @@ -21,13 +21,13 @@ latest: &latest matrix: small: - - postgres-version: 15.2-alpine + - postgres-version: 16.2-alpine <<: *max full: - # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life + # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life - postgres-version: 9.4.26-alpine <<: *min - - postgres-version: 15.2-alpine + - postgres-version: 16.2-alpine <<: *max nightly: - postgres-version: 9.4.26-alpine diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index a0825603b..d9b9338f8 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -9,8 +9,8 @@ min: &min os: ubuntu-latest max: &max - minio-version: 2023.7.18 - spark-version: 3.5.0 + minio-version: 2024.4.18 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index 
5b0b2628e..b1e6b56da 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -15,6 +15,7 @@ latest: &latest matrix: small: + # elswork/samba image versions does not correlate with smbd version, it is always 4.x - server-version: latest <<: *max full: diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 0dfd9e730..a32f6f823 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -15,13 +15,13 @@ latest: &latest matrix: small: - - openssh-version: 9.3_p1-r3-ls120 + - openssh-version: 9.6_p1-r0-ls154 <<: *max full: - # prior image versions does not accept incoming connections, seems like a bug + # prior image versions does not accept incoming connections, seems like a bug - openssh-version: 8.1_p1-r0-ls5 <<: *min - - openssh-version: 9.3_p1-r3-ls120 + - openssh-version: 9.6_p1-r0-ls154 <<: *max nightly: - openssh-version: 8.1_p1-r0-ls5 diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 9647daec6..6c2a55455 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,5 +1,5 @@ max: &max - spark-version: 3.5.0 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index 8d8f012a7..fb76e3282 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -15,7 +15,7 @@ latest: &latest matrix: small: - # chonjay21/webdav image has only latest tag + # chonjay21/webdav image has only latest tag - webdav-version: latest <<: *max full: diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index e4579eb28..8f024cf88 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -86,7 +86,7 @@ jobs: - name: Check if base files 
are changed id: changed-base - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/base/tracked.txt files_ignore_from_source_file: .github/workflows/data/base/ignored.txt @@ -97,7 +97,7 @@ jobs: - name: Check if db-related files are changed id: changed-db - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/db/tracked.txt files_ignore_from_source_file: .github/workflows/data/db/ignored.txt @@ -108,7 +108,7 @@ jobs: - name: Check if file-related files are changed id: changed-file - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/file/tracked.txt files_ignore_from_source_file: .github/workflows/data/file/ignored.txt @@ -119,7 +119,7 @@ jobs: - name: Check if file-df-related files are changed id: changed-file-df - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/file-df/tracked.txt files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt @@ -130,7 +130,7 @@ jobs: - name: Check if core files are changed id: changed-core - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/core/tracked.txt files_ignore_from_source_file: .github/workflows/data/core/ignored.txt @@ -154,13 +154,13 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml - name: Check if Clickhouse files are changed id: changed-clickhouse - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/clickhouse/tracked.txt files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt @@ -184,13 +184,13 @@ jobs: - name: Get 
Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml - name: Check if Greenplum files are changed id: changed-greenplum - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/greenplum/tracked.txt files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt @@ -214,13 +214,13 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml - name: Check if Hive files are changed id: changed-hive - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/hive/tracked.txt files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt @@ -244,13 +244,13 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml - name: Check if Kafka files are changed id: changed-kafka - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/kafka/tracked.txt files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt @@ -274,13 +274,13 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml - name: Check if LocalFS files are changed id: changed-local-fs - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/local-fs/tracked.txt files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt @@ -304,13 +304,13 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: 
mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml - name: Check if MongoDB files are changed id: changed-mongodb - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mongodb/tracked.txt files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt @@ -334,13 +334,13 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml - name: Check if MSSQL files are changed id: changed-mssql - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mssql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt @@ -364,13 +364,13 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml - name: Check if MySQL files are changed id: changed-mysql - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/mysql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt @@ -394,13 +394,13 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml - name: Check if Oracle files are changed id: changed-oracle - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/oracle/tracked.txt files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt @@ -424,13 +424,13 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json 
'.matrix' .github/workflows/data/oracle/matrix.yml - name: Check if Postgres files are changed id: changed-postgres - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/postgres/tracked.txt files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt @@ -454,13 +454,13 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml - name: Check if Teradata files are changed id: changed-teradata - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/teradata/tracked.txt files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt @@ -484,13 +484,13 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml - name: Check if FTP files are changed id: changed-ftp - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/ftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt @@ -514,13 +514,13 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml - name: Check if FTPS files are changed id: changed-ftps - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/ftps/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt @@ -544,13 +544,13 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml - name: Check if HDFS 
files are changed id: changed-hdfs - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/hdfs/tracked.txt files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt @@ -574,13 +574,13 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml - name: Check if S3 files are changed id: changed-s3 - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/s3/tracked.txt files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt @@ -604,13 +604,13 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml - name: Check if SFTP files are changed id: changed-sftp - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/sftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt @@ -634,13 +634,13 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml - name: Check if Samba files are changed id: changed-samba - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: files_from_source_file: .github/workflows/data/samba/tracked.txt files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt @@ -664,13 +664,13 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml - name: Check if WebDAV files are changed id: changed-webdav - uses: tj-actions/changed-files@v43 + uses: tj-actions/changed-files@v44 with: 
files_from_source_file: .github/workflows/data/webdav/tracked.txt files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.42.1 + uses: mikefarah/yq@v4.44.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 4a29f84cb..4f8d436ec 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 
'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Clickhouse to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8123 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index cdd977398..65d681dc1 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version 
}}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -72,7 +72,7 @@ jobs: - name: Run tests run: | ./run_tests.sh -m 'not connection' - ./run_tests.sh onetl/_util + ./run_tests.sh onetl/_util onetl/_internal.py onetl/hooks onetl/file/filter onetl/file/limit onetl/hwm/store/hwm_class_registry.py - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 4e947d738..e41e1f3eb 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -50,20 +50,15 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
+ # See https://github.com/orgs/community/discussions/25792 - name: Start FTP run: | docker compose down -v --remove-orphans - docker compose up -d ftp + docker compose up -d ftp --wait --wait --wait-timeout 200 env: FTP_IMAGE: chonjay21/ftps:${{ inputs.ftp-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - - - name: Wait for FTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2121 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftp${{ inputs.ftp-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 19cce458c..4fb9c6234 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -50,20 +50,15 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
+ # See https://github.com/orgs/community/discussions/25792 - name: Start FTPS run: | docker compose down -v --remove-orphans - docker compose up -d ftps + docker compose up -d ftps --wait --wait --wait-timeout 200 env: FTPS_IMAGE: chonjay21/ftps:${{ inputs.ftps-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - - - name: Wait for FTPS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2122 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-ftps${{ inputs.ftps-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index bf5e5012e..d54e96970 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -76,10 +76,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version 
}}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Set up Postgres client if: runner.os == 'Linux' @@ -113,7 +113,6 @@ jobs: export ONETL_GP_PACKAGE_VERSION=${{ inputs.package-version }} ./pytest_runner.sh -m greenplum env: - ONETL_DB_WITH_GREENPLUM: 'true' GREENPLUM_PACKAGES_USER: ${{ secrets.GREENPLUM_PACKAGES_USER }} GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index e97ee6249..6e52a5df1 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -51,25 +51,17 @@ jobs: sudo apt-get update sudo apt-get install --no-install-recommends libkrb5-dev gcc - - name: Cache Ivy - uses: actions/cache@v4 - if: inputs.with-cache - with: - path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs- - - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version 
}}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -78,8 +70,8 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt 
-r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - # Cannot use services because we need to mount config file from the repo, but services start before checkout. - # See https://github.com/orgs/community/discussions/25792 + # Cannot use services because we need to mount config file from the repo, but services start before checkout. + # See https://github.com/orgs/community/discussions/25792 - name: Start HDFS run: | docker compose down -v --remove-orphans @@ -89,11 +81,6 @@ jobs: wait $wait_pid env: HDFS_IMAGE: mtsrus/hadoop:${{ inputs.hadoop-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - - - name: Wait for HDFS to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9870 -t 60 - name: Run tests run: | @@ -107,8 +94,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-hadoop${{ inputs.hadoop-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 446cb76d0..893348ab6 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ 
inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index d1389049f..34c2894a9 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -58,13 +58,6 @@ jobs: KAFKA_CFG_ADVERTISED_LISTENERS: INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT KAFKA_CFG_SASL_ENABLED_MECHANISMS: PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 - # old config names for <1.1.1 - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 - KAFKA_INTER_BROKER_LISTENER_NAME: INTERNAL_PLAINTEXT_ANONYMOUS - KAFKA_LISTENERS: INTERNAL_PLAINTEXT_ANONYMOUS://:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://:9093,INTERNAL_PLAINTEXT_SASL://:9094,EXTERNAL_PLAINTEXT_SASL://:9095 - KAFKA_ADVERTISED_LISTENERS: 
INTERNAL_PLAINTEXT_ANONYMOUS://kafka:9092,EXTERNAL_PLAINTEXT_ANONYMOUS://localhost:9093,INTERNAL_PLAINTEXT_SASL://kafka:9094,EXTERNAL_PLAINTEXT_SASL://localhost:9095 - KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,EXTERNAL_PLAINTEXT_ANONYMOUS:PLAINTEXT,INTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT,EXTERNAL_PLAINTEXT_SASL:SASL_PLAINTEXT - KAFKA_SASL_ENABLED_MECHANISMS: PLAIN,SCRAM-SHA-256,SCRAM-SHA-512 ports: - 9093:9093 - 9095:9095 @@ -104,10 +97,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version 
}}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -116,11 +109,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Kafka to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9093 -t 60 - ./docker/wait-for-it.sh -h localhost -p 9095 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index 529deef41..f4b37c45a 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -57,10 +57,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ 
inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index c842ce7af..a617450b6 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -69,10 +69,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 
'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -81,10 +81,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MongoDB to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 27017 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index efc8edce8..0819887aa 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -29,18 +29,6 @@ jobs: test-mssql: name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} - services: - mssql: - image: mcmoe/mssqldocker:${{ inputs.mssql-version }} - env: - TZ: UTC - MSSQL_DB: onetl - MSSQL_USER: onetl - MSSQL_PASSWORD: 7ellowEl7akey - ACCEPT_EULA: Y - SA_PASSWORD: 2astazeY - ports: - - 1433:1433 steps: - name: Checkout code @@ -72,10 +60,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ 
inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -84,9 +72,14 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MSSQL to be ready + # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
+ # See https://github.com/orgs/community/discussions/25792 + - name: Start MSSQL run: | - ./docker/wait-for-it.sh -h localhost -p 1433 -t 60 + docker compose down -v --remove-orphans + docker compose up -d mssql --wait --wait --wait-timeout 200 + env: + MSSQL_IMAGE: mcr.microsoft.com/mssql/server:${{ inputs.mssql-version }} - name: Run tests run: | @@ -95,6 +88,11 @@ jobs: source ./env ./pytest_runner.sh -m mssql + - name: Shutdown MSSQL + if: always() + run: | + docker compose down -v --remove-orphans + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index c98562dd4..e2035cfc7 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ 
inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for MySQL to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 3306 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index c22bc9553..e11a57b84 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -77,10 +77,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 
'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Set up Oracle instantclient if: runner.os == 'Linux' @@ -98,10 +98,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Oracle to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 1522 -t 60 - - name: Run tests run: | export ONETL_ORA_CLIENT_PATH=./oracle/instantclient_21_10 diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 7ac821bc1..ef31a0375 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -70,10 +70,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 
'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -82,10 +82,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for Postgres to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 5432 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index a75297fe3..8da4540cd 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -71,10 +71,10 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 
'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel @@ -83,10 +83,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for S3 to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 9010 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 3a7c1c921..58db08b88 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -57,11 +57,6 @@ jobs: 
docker compose up -d samba env: SAMBA_IMAGE: elswork/samba:${{ inputs.server-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - - - name: Wait for Samba to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 445 -t 60 - name: Run tests run: | @@ -74,8 +69,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-samba${{ inputs.server-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 569d580f7..ffbf786f2 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -60,10 +60,6 @@ jobs: run: | pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - - name: Wait for SFTP to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 2222 -t 60 - - name: Run tests run: | mkdir reports/ || echo "Directory exists" diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index 214859e2b..b348da5f2 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -54,12 +54,13 @@ jobs: - name: Cache pip uses: actions/cache@v4 + if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ 
inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-spark- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index ee23f0ae8..472519643 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -59,11 +59,6 @@ jobs: docker compose up -d webdav env: WEBDAV_IMAGE: chonjay21/webdav:${{ inputs.webdav-version }} - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - - - name: Wait for WebDAV to be ready - run: | - ./docker/wait-for-it.sh -h localhost -p 8000 -t 60 - name: Run tests run: | @@ -76,8 +71,6 @@ jobs: if: always() run: | docker compose down -v --remove-orphans - env: - COMPOSE_PROJECT_NAME: ${{ github.run_id }}-webdav${{ inputs.webdav-version }} - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 516ea1cf1..aa7bde988 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: check-ast - id: check-case-conflict @@ -58,7 +58,7 @@ repos: args: 
[-w] - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.12.0 + rev: v2.13.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '2', --preserve-quotes, --offset, '2'] @@ -67,6 +67,7 @@ repos: rev: v6.2.1 hooks: - id: beautysh + additional_dependencies: [setuptools] - repo: https://github.com/IamTheFij/docker-pre-commit rev: v3.0.1 @@ -89,13 +90,18 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.1 + rev: v3.15.2 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] + - repo: https://github.com/asottile/add-trailing-comma + rev: v3.1.0 + hooks: + - id: add-trailing-comma + - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.2 hooks: - id: black language_version: python3 @@ -105,7 +111,7 @@ repos: hooks: - id: blacken-docs additional_dependencies: - - black==24.3.0 + - black==24.4.2 - repo: https://github.com/pycqa/bandit rev: 1.7.8 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index e347a6ecf..e7a60fc13 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -11,7 +11,7 @@ Limitations We should keep close to these items during development: -* Some companies still use old Spark versions, like 2.3.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. +* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. * Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional. * Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition. 
@@ -71,7 +71,7 @@ Create virtualenv and install dependencies: -r requirements/tests/postgres.txt \ -r requirements/tests/oracle.txt \ -r requirements/tests/pydantic-2.txt \ - -r requirements/tests/spark-3.5.0.txt + -r requirements/tests/spark-3.5.1.txt # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 pip install sphinx-plantuml --no-deps @@ -176,7 +176,6 @@ Without docker-compose * Download `VMware Greenplum connector for Spark `_ * Either move it to ``~/.ivy2/jars/``, or pass file path to ``CLASSPATH`` - * Set environment variable ``ONETL_DB_WITH_GREENPLUM=true`` to enable adding connector to Spark session Start all containers with dependencies: diff --git a/README.rst b/README.rst index ffc68e52b..da0b84cb6 100644 --- a/README.rst +++ b/README.rst @@ -53,6 +53,7 @@ Non-goals Requirements ------------ + * **Python 3.7 - 3.12** * PySpark 2.3.x - 3.5.x (depends on used connector) * Java 8+ (required by Spark, see below) @@ -61,50 +62,65 @@ Requirements Supported storages ------------------ -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Type | Storage | Powered by | -+====================+==============+=========================================================================================================================+ -| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | -+ +--------------+ + -| | MSSQL | | -+ +--------------+ + -| | MySQL | | -+ +--------------+ + -| | Postgres | | -+ +--------------+ + -| | Oracle | | -+ +--------------+ + -| | Teradata | | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Hive | Apache Spark `Hive integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Kafka | Apache Spark 
`Kafka integration `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Greenplum | VMware `Greenplum Spark connector `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | MongoDB | `MongoDB Spark connector `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| File | HDFS | `HDFS Python client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | S3 | `minio-py client `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SFTP | `Paramiko library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | FTP | `FTPUtil library `_ | -+ +--------------+ + -| | FTPS | | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | WebDAV | `WebdavClient3 library `_ | -+ +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | Samba | `pysmb library `_ | -+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | -| +--------------+ + -| | SparkHDFS | | -| +--------------+-------------------------------------------------------------------------------------------------------------------------+ -| | SparkS3 | `Hadoop AWS `_ library | 
-+--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ - +Database +~~~~~~~~ + ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | ++==============+=========================================================================================================================+ +| Clickhouse | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MSSQL | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MySQL | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Postgres | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Oracle | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Teradata | Apache Spark `JDBC Data Source `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Hive | Apache Spark `Hive integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Kafka | Apache Spark `Kafka integration `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ 
+| Greenplum | VMware `Greenplum Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ +| MongoDB | `MongoDB Spark connector `_ | ++--------------+-------------------------------------------------------------------------------------------------------------------------+ + +File +~~~~ ++--------------+--------------------------------------------------------------------+ +| Storage | Powered by | ++==============+====================================================================+ +| HDFS | `HDFS Python client `_ | ++--------------+--------------------------------------------------------------------+ +| S3 | `minio-py client `_ | ++--------------+--------------------------------------------------------------------+ +| SFTP | `Paramiko library `_ | ++--------------+--------------------------------------------------------------------+ +| FTP | `FTPUtil library `_ | ++--------------+--------------------------------------------------------------------+ +| FTPS | `FTPUtil library `_ | ++--------------+--------------------------------------------------------------------+ +| WebDAV | `WebdavClient3 library `_ | ++--------------+--------------------------------------------------------------------+ +| Samba | `pysmb library `_ | ++--------------+--------------------------------------------------------------------+ + +Files as DataFrame +~~~~~~~~~~~~~~~~~~ + ++--------------+---------------------------------------------------------------------------------------------------------------+ +| Storage | Powered by | ++==============+===============================================================================================================+ +| SparkLocalFS | Apache Spark `File Data Source `_ | ++--------------+---------------------------------------------------------------------------------------------------------------+ +| SparkHDFS | Apache Spark `File Data Source `_ | 
++--------------+---------------------------------------------------------------------------------------------------------------+ +| SparkS3 | `Hadoop AWS `_ library | ++--------------+---------------------------------------------------------------------------------------------------------------+ .. documentation @@ -171,17 +187,17 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | Spark | Python | Java | Scala | +==============================================================+=============+=============+=======+ -| `2.3.x `_ | 3.7 only | 8 only | 2.11 | +| `2.3.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ | `2.4.x `_ | 3.7 only | 8 only | 2.11 | +--------------------------------------------------------------+-------------+-------------+-------+ | `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ .. _pyspark-install: @@ -196,7 +212,7 @@ or install PySpark explicitly: .. code:: bash - pip install onetl pyspark==3.5.0 # install a specific PySpark version + pip install onetl pyspark==3.5.1 # install a specific PySpark version or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. 
**Otherwise connection object cannot be created.** @@ -537,7 +553,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th setup_logging() # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded - maven_packages = SparkS3.get_packages(spark_version="3.5.0") + Postgres.get_packages() + maven_packages = SparkS3.get_packages(spark_version="3.5.1") + Postgres.get_packages() spark = ( SparkSession.builder.appName("spark_app_onetl_demo") .config("spark.jars.packages", ",".join(maven_packages)) diff --git a/docker-compose.yml b/docker-compose.yml index 6ba2aca64..3a61170e0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: context: . target: base args: - SPARK_VERSION: 3.5.0 + SPARK_VERSION: 3.5.1 env_file: .env.docker volumes: - ./:/app/ @@ -31,6 +31,8 @@ services: - 5433:5432 networks: - onetl + sysctls: + - net.ipv6.conf.all.disable_ipv6=1 clickhouse: image: ${CLICKHOUSE_IMAGE:-clickhouse/clickhouse-server:latest-alpine} @@ -83,11 +85,14 @@ services: - onetl mssql: - image: ${MSSQL_IMAGE:-mcmoe/mssqldocker:latest} + image: ${MSSQL_IMAGE:-mcr.microsoft.com/mssql/server:latest} restart: unless-stopped env_file: .env.dependencies ports: - 1433:1433 + volumes: + - ./docker/mssql/:/usr/config/ + entrypoint: ["/usr/config/entrypoint.sh"] networks: - onetl platform: linux/amd64 @@ -173,7 +178,7 @@ services: - onetl samba: - image: elswork/samba + image: ${SAMBA_IMAGE:-elswork/samba} restart: unless-stopped ports: - "139:139" diff --git a/docker/Dockerfile b/docker/Dockerfile index 36cbb129f..d3d34ef21 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -42,10 +42,9 @@ USER onetl ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH} COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/ -COPY --chown=onetl:onetl ./docker/wait-for-it.sh /app/docker/wait-for-it.sh -RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh 
/app/docker/wait-for-it.sh +RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh -ARG SPARK_VERSION=3.5.0 +ARG SPARK_VERSION=3.5.1 # Spark is heavy, and version change is quite rare COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/ RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt diff --git a/docker/mssql/configure-db.sh b/docker/mssql/configure-db.sh new file mode 100755 index 000000000..51b39ae32 --- /dev/null +++ b/docker/mssql/configure-db.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -o pipefail + +# Wait 60 seconds for SQL Server to start up by ensuring that +# calling SQLCMD does not return an error code, which will ensure that sqlcmd is accessible +# and that system and user databases return "0" which means all databases are in an "online" state +# https://docs.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-databases-transact-sql?view=sql-server-2017 + +declare DBSTATUS +declare ERRCODE +TIMEOUT=60 +START=$(date +%s) +echo "Configure DB script started at $(date)" + +while true; do + DELTA=$(($(date +%s) - START)) + if [[ $DELTA -gt $TIMEOUT ]]; then + echo "ERROR: SQL Server took more than ${TIMEOUT} seconds to START up or one or more databases are not in an ONLINE state" + exit 1 + fi + + DBSTATUS=$(/opt/mssql-tools/bin/sqlcmd -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') + ERRCODE=$? + if [[ "$DBSTATUS" -eq "0" && "$ERRCODE" -eq "0" ]]; then + echo "INFO: Database ready." + break + else + echo "INFO: Waiting for database to be ready..." 
+ sleep 1 + fi +done + +# Run the setup script to create the DB and the schema in the DB +echo "Running setup.sql"; +/opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; +echo "Success"; diff --git a/docker/mssql/entrypoint.sh b/docker/mssql/entrypoint.sh new file mode 100755 index 000000000..4f274086b --- /dev/null +++ b/docker/mssql/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# Start the script to create the DB and user +/usr/config/configure-db.sh & + +# Start SQL Server +/opt/mssql/bin/sqlservr diff --git a/docker/mssql/setup.sql b/docker/mssql/setup.sql new file mode 100644 index 000000000..0cf617237 --- /dev/null +++ b/docker/mssql/setup.sql @@ -0,0 +1,20 @@ +/* + +Enter custom T-SQL here that would run after SQL Server has started up. + +*/ + +CREATE DATABASE onetl; +GO + +USE onetl; +GO + +CREATE LOGIN onetl WITH PASSWORD = '7ellowEl7akey'; +GO + +CREATE USER onetl FOR LOGIN onetl; +GO + +GRANT CONTROL ON DATABASE::onetl TO onetl; +GO diff --git a/docker/wait-for-it.sh b/docker/wait-for-it.sh deleted file mode 100755 index 7410fa3a6..000000000 --- a/docker/wait-for-it.sh +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env bash -# Use this script to test if a given TCP host/port are available - -WAITFORIT_cmdname=${0##*/} - -echoerr() { if [[ $WAITFORIT_QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } - -usage() -{ - cat << USAGE >&2 -Usage: - $WAITFORIT_cmdname host:port [-s] [-t timeout] [-- command args] - -h HOST | --host=HOST Host or IP under test - -p PORT | --port=PORT TCP port under test - Alternatively, you specify the host and port as host:port - -s | --strict Only execute subcommand if the test succeeds - -q | --quiet Don't output any status messages - -t TIMEOUT | --timeout=TIMEOUT - Timeout in seconds, zero for no timeout - -- COMMAND ARGS Execute command with args after the test finishes -USAGE - exit 1 -} - -wait_for() -{ - if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then - echoerr "$WAITFORIT_cmdname: waiting 
$WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" - else - echoerr "$WAITFORIT_cmdname: waiting for $WAITFORIT_HOST:$WAITFORIT_PORT without a timeout" - fi - WAITFORIT_start_ts=$(date +%s) - while : - do - if [[ $WAITFORIT_ISBUSY -eq 1 ]]; then - nc -z $WAITFORIT_HOST $WAITFORIT_PORT - WAITFORIT_result=$? - else - (echo > /dev/tcp/$WAITFORIT_HOST/$WAITFORIT_PORT) >/dev/null 2>&1 - WAITFORIT_result=$? - fi - if [[ $WAITFORIT_result -eq 0 ]]; then - WAITFORIT_end_ts=$(date +%s) - echoerr "$WAITFORIT_cmdname: $WAITFORIT_HOST:$WAITFORIT_PORT is available after $((WAITFORIT_end_ts - WAITFORIT_start_ts)) seconds" - break - fi - sleep 1 - done - return $WAITFORIT_result -} - -wait_for_wrapper() -{ - # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 - if [[ $WAITFORIT_QUIET -eq 1 ]]; then - timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --quiet --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & - else - timeout $WAITFORIT_BUSYTIMEFLAG $WAITFORIT_TIMEOUT $0 --child --host=$WAITFORIT_HOST --port=$WAITFORIT_PORT --timeout=$WAITFORIT_TIMEOUT & - fi - WAITFORIT_PID=$! - trap "kill -INT -$WAITFORIT_PID" INT - wait $WAITFORIT_PID - WAITFORIT_RESULT=$? 
- if [[ $WAITFORIT_RESULT -ne 0 ]]; then - echoerr "$WAITFORIT_cmdname: timeout occurred after waiting $WAITFORIT_TIMEOUT seconds for $WAITFORIT_HOST:$WAITFORIT_PORT" - fi - return $WAITFORIT_RESULT -} - -# process arguments -while [[ $# -gt 0 ]] -do - case "$1" in - *:* ) - WAITFORIT_hostport=(${1//:/ }) - WAITFORIT_HOST=${WAITFORIT_hostport[0]} - WAITFORIT_PORT=${WAITFORIT_hostport[1]} - shift 1 - ;; - --child) - WAITFORIT_CHILD=1 - shift 1 - ;; - -q | --quiet) - WAITFORIT_QUIET=1 - shift 1 - ;; - -s | --strict) - WAITFORIT_STRICT=1 - shift 1 - ;; - -h) - WAITFORIT_HOST="$2" - if [[ $WAITFORIT_HOST == "" ]]; then break; fi - shift 2 - ;; - --host=*) - WAITFORIT_HOST="${1#*=}" - shift 1 - ;; - -p) - WAITFORIT_PORT="$2" - if [[ $WAITFORIT_PORT == "" ]]; then break; fi - shift 2 - ;; - --port=*) - WAITFORIT_PORT="${1#*=}" - shift 1 - ;; - -t) - WAITFORIT_TIMEOUT="$2" - if [[ $WAITFORIT_TIMEOUT == "" ]]; then break; fi - shift 2 - ;; - --timeout=*) - WAITFORIT_TIMEOUT="${1#*=}" - shift 1 - ;; - --) - shift - WAITFORIT_CLI=("$@") - break - ;; - --help) - usage - ;; - *) - echoerr "Unknown argument: $1" - usage - ;; - esac -done - -if [[ "$WAITFORIT_HOST" == "" || "$WAITFORIT_PORT" == "" ]]; then - echoerr "Error: you need to provide a host and port to test." - usage -fi - -WAITFORIT_TIMEOUT=${WAITFORIT_TIMEOUT:-15} -WAITFORIT_STRICT=${WAITFORIT_STRICT:-0} -WAITFORIT_CHILD=${WAITFORIT_CHILD:-0} -WAITFORIT_QUIET=${WAITFORIT_QUIET:-0} - -# Check to see if timeout is from busybox? 
-WAITFORIT_TIMEOUT_PATH=$(type -p timeout) -WAITFORIT_TIMEOUT_PATH=$(realpath $WAITFORIT_TIMEOUT_PATH 2>/dev/null || readlink -f $WAITFORIT_TIMEOUT_PATH) - -WAITFORIT_BUSYTIMEFLAG="" -if [[ $WAITFORIT_TIMEOUT_PATH =~ "busybox" ]]; then - WAITFORIT_ISBUSY=1 - # Check if busybox timeout uses -t flag - # (recent Alpine versions don't support -t anymore) - if timeout &>/dev/stdout | grep -q -e '-t '; then - WAITFORIT_BUSYTIMEFLAG="-t" - fi -else - WAITFORIT_ISBUSY=0 -fi - -if [[ $WAITFORIT_CHILD -gt 0 ]]; then - wait_for - WAITFORIT_RESULT=$? - exit $WAITFORIT_RESULT -else - if [[ $WAITFORIT_TIMEOUT -gt 0 ]]; then - wait_for_wrapper - WAITFORIT_RESULT=$? - else - wait_for - WAITFORIT_RESULT=$? - fi -fi - -if [[ $WAITFORIT_CLI != "" ]]; then - if [[ $WAITFORIT_RESULT -ne 0 && $WAITFORIT_STRICT -eq 1 ]]; then - echoerr "$WAITFORIT_cmdname: strict mode, refusing to execute subprocess" - exit $WAITFORIT_RESULT - fi - exec "${WAITFORIT_CLI[@]}" -else - exit $WAITFORIT_RESULT -fi diff --git a/docs/changelog/0.10.0.rst b/docs/changelog/0.10.0.rst index e546d150c..189a2457f 100644 --- a/docs/changelog/0.10.0.rst +++ b/docs/changelog/0.10.0.rst @@ -44,7 +44,7 @@ Breaking Changes - ``HWM`` classes used by previous onETL versions were moved from ``etl_entities`` to ``etl_entities.old_hwm`` submodule. They are here for compatibility reasons, but are planned to be removed in ``etl-entities`` v3 release. - New ``HWM`` classes have flat structure instead of nested. - New ``HWM`` classes have mandatory ``name`` attribute (it was known as ``qualified_name`` before). - - Type aliases used while serializing and deserializing ``HWM`` objects to ``dict`` representation were changed too: ``int`` -> ``column_int``. + - Type aliases used while serializing and deserializing ``HWM`` objects to ``dict`` representation were changed too: ``int`` → ``column_int``. To make migration simpler, you can use new method: @@ -53,7 +53,7 @@ Breaking Changes old_hwm = OldIntHWM(...) 
new_hwm = old_hwm.as_new_hwm() - Which automatically converts all fields from old structure to new one, including ``qualified_name`` -> ``name``. + Which automatically converts all fields from old structure to new one, including ``qualified_name`` → ``name``. - **Breaking changes:** diff --git a/docs/changelog/0.11.0.rst b/docs/changelog/0.11.0.rst new file mode 100644 index 000000000..8abdc2b7a --- /dev/null +++ b/docs/changelog/0.11.0.rst @@ -0,0 +1,229 @@ +0.11.0 (2024-05-27) +=================== + +Breaking Changes +---------------- + +There can be some changes in connection behavior, related to version upgrades. So we mark these changes as **breaking** although +most of users will not see any differences. + +- Update Clickhouse JDBC driver to latest version (:github:pull:`249`): + * Package was renamed ``ru.yandex.clickhouse:clickhouse-jdbc`` → ``com.clickhouse:clickhouse-jdbc``. + * Package version changed ``0.3.2`` → ``0.6.0-patch5``. + * Driver name changed ``ru.yandex.clickhouse.ClickHouseDriver`` → ``com.clickhouse.jdbc.ClickHouseDriver``. + + This brings up several fixes for Spark <-> Clickhouse type compatibility, and also Clickhouse clusters support. + +- Update other JDBC drivers to latest versions: + * MSSQL ``12.2.0`` → ``12.6.2`` (:github:pull:`254`). + * MySQL ``8.0.33`` → ``8.4.0`` (:github:pull:`253`, :github:pull:`285`). + * Oracle ``23.2.0.0`` → ``23.4.0.24.05`` (:github:pull:`252`, :github:pull:`284`). + * Postgres ``42.6.0`` → ``42.7.3`` (:github:pull:`251`). + +- Update MongoDB connector to latest version: ``10.1.1`` → ``10.3.0`` (:github:pull:`255`, :github:pull:`283`). + + This brings up Spark 3.5 support. + +- Update ``XML`` package to latest version: ``0.17.0`` → ``0.18.0`` (:github:pull:`259`). + + This brings few bugfixes with datetime format handling. + +- Serialize ``ColumnDatetimeHWM`` to Clickhouse's ``DateTime64(6)`` (precision up to microseconds) instead of ``DateTime`` (precision up to seconds) (:github:pull:`267`). 
+ + In previous onETL versions, ``ColumnDatetimeHWM`` value was rounded to the second, and thus some rows that were read in previous runs were read again, + producing duplicates. + + For Clickhouse versions below 21.1 comparing column of type ``DateTime`` with a value of type ``DateTime64`` is not supported, returning an empty dataframe. + To avoid this, replace: + + .. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="hwm_column", # <-- + ), + ) + + with: + + .. code:: python + + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="CAST(hwm_column AS DateTime64)", # <-- add explicit CAST + ), + ) + +- Pass JDBC connection extra params as ``properties`` dict instead of URL with query part (:github:pull:`268`). + + This allows passing custom connection parameters like ``Clickhouse(extra={"custom_http_options": "option1=value1,option2=value2"})`` + without need to apply urlencode to parameter value, like ``option1%3Dvalue1%2Coption2%3Dvalue2``. + +- For JDBC connections add new ``SQLOptions`` class for ``DB.sql(query, options=...)`` method (:github:pull:`272`). + + Firstly, to keep naming more consistent. + + Secondly, some of options are not supported by ``DB.sql(...)`` method, but supported by ``DBReader``. + For example, ``SQLOptions`` do not support ``partitioning_mode`` and require explicit definition of ``lower_bound`` and ``upper_bound`` when ``num_partitions`` is greater than 1. + ``ReadOptions`` does support ``partitioning_mode`` and allows skipping ``lower_bound`` and ``upper_bound`` values. + + This requires some code changes. Before: + + .. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.ReadOptions( + partitioning_mode="range", + partition_column="id", + num_partitions=10, + ), + ) + + After: + + .. 
code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT * + FROM some.mytable + WHERE key = 'something' + """, + options=Postgres.SQLOptions( + # partitioning_mode is not supported! + partition_column="id", + num_partitions=10, + lower_bound=0, # <-- set explicitly + upper_bound=1000, # <-- set explicitly + ), + ) + + For now, ``DB.sql(query, options=...)`` can accept ``ReadOptions`` to keep backward compatibility, but emits a deprecation warning. + The support will be removed in ``v1.0.0``. + +- Split up ``JDBCOptions`` class into ``FetchOptions`` and ``ExecuteOptions`` (:github:pull:`274`). + + New classes are used by ``DB.fetch(query, options=...)`` and ``DB.execute(query, options=...)`` methods respectively. + This is mostly to keep naming more consistent. + + This requires some code changes. Before: + + .. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.JDBCOptions( + fetchsize=1000, + query_timeout=30, + ), + ) + + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.JDBCOptions(query_timeout=30), + ) + + After: + + .. code-block:: python + + from onetl.connection import Postgres + + # Using FetchOptions for fetching data + postgres = Postgres(...) 
+ df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.FetchOptions( # <-- change class name + fetchsize=1000, + query_timeout=30, + ), + ) + + # Using ExecuteOptions for executing statements + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.ExecuteOptions(query_timeout=30), # <-- change class name + ) + + For now, ``DB.fetch(query, options=...)`` and ``DB.execute(query, options=...)`` can accept ``JDBCOptions``, to keep backward compatibility, + but emit a deprecation warning. The old class will be removed in ``v1.0.0``. + +Features +-------- + +Improve user experience with Kafka messages and Database tables with serialized columns, like JSON/XML. + +- Allow passing custom package version as argument for ``DB.get_packages(...)`` method of several DB connectors: + * ``Clickhouse.get_packages(package_version=..., apache_http_client_version=...)`` (:github:pull:`249`). + * ``MongoDB.get_packages(scala_version=..., spark_version=..., package_version=...)`` (:github:pull:`255`). + * ``MySQL.get_packages(package_version=...)`` (:github:pull:`253`). + * ``MSSQL.get_packages(java_version=..., package_version=...)`` (:github:pull:`254`). + * ``Oracle.get_packages(java_version=..., package_version=...)`` (:github:pull:`252`). + * ``Postgres.get_packages(package_version=...)`` (:github:pull:`251`). + * ``Teradata.get_packages(package_version=...)`` (:github:pull:`256`). + + Now users can downgrade or upgrade connection without waiting for next onETL release. Previously only ``Kafka`` and ``Greenplum`` supported this feature. + +- Add ``FileFormat.parse_column(...)`` method to several classes: + * ``Avro.parse_column(col)`` (:github:pull:`265`). + * ``JSON.parse_column(col, schema=...)`` (:github:pull:`257`). + * ``CSV.parse_column(col, schema=...)`` (:github:pull:`258`). + * ``XML.parse_column(col, schema=...)`` (:github:pull:`269`). 
+ + This allows parsing data in ``value`` field of Kafka message or string/binary column of some table as a nested Spark structure. + +- Add ``FileFormat.serialize_column(...)`` method to several classes: + * ``Avro.serialize_column(col)`` (:github:pull:`265`). + * ``JSON.serialize_column(col)`` (:github:pull:`257`). + * ``CSV.serialize_column(col)`` (:github:pull:`258`). + + This allows saving Spark nested structures or arrays to ``value`` field of Kafka message or string/binary column of some table. + +Improvements +------------ + +A few documentation improvements. + +- Replace all ``assert`` in documentation with doctest syntax. This should make documentation more readable (:github:pull:`273`). + +- Add generic ``Troubleshooting`` guide (:github:pull:`275`). + +- Improve Kafka documentation: + * Add "Prerequisites" page describing different aspects of connecting to Kafka. + * Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes. + * Add "Troubleshooting" page (:github:pull:`276`). + +- Improve Hive documentation: + * Add "Prerequisites" page describing different aspects of connecting to Hive. + * Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations. + * Improve "Executing statements in Hive" page of Hive documentation. (:github:pull:`278`). + +- Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. (:github:pull:`279`). + +- Add note about connecting to Clickhouse cluster. (:github:pull:`280`). + +- Add notes about versions when specific class/method/attribute/argument was added, renamed or changed behavior (:github:pull:`282`). + + +Bug Fixes +--------- + +- Fix missing ``pysmb`` package after installing ``pip install onetl[files]``. 
diff --git a/docs/changelog/0.7.0.rst b/docs/changelog/0.7.0.rst index ac2928195..385f736f7 100644 --- a/docs/changelog/0.7.0.rst +++ b/docs/changelog/0.7.0.rst @@ -114,13 +114,13 @@ Breaking Changes * Greenplum ``2.1.3`` → ``2.1.4``. * MSSQL ``10.2.1.jre8`` → ``12.2.0.jre8``. Minimal supported version of MSSQL is now 2014 instead 2021. * MySQL ``8.0.30`` → ``8.0.33``: - * * Package was renamed ``mysql:mysql-connector-java`` → ``com.mysql:mysql-connector-j``. - * * Driver class was renamed ``com.mysql.jdbc.Driver`` → ``com.mysql.cj.jdbc.Driver``. + * Package was renamed ``mysql:mysql-connector-java`` → ``com.mysql:mysql-connector-j``. + * Driver class was renamed ``com.mysql.jdbc.Driver`` → ``com.mysql.cj.jdbc.Driver``. * Oracle ``21.6.0.0.1`` → ``23.2.0.0``. * Postgres ``42.4.0`` → ``42.6.0``. * Teradata ``17.20.00.08`` → ``17.20.00.15``: - * * Package was renamed ``com.teradata.jdbc:terajdbc4`` → ``com.teradata.jdbc:terajdbc``. - * * Teradata driver is now published to Maven. + * Package was renamed ``com.teradata.jdbc:terajdbc4`` → ``com.teradata.jdbc:terajdbc``. + * Teradata driver is now published to Maven. See :github:pull:`31`. diff --git a/docs/changelog/0.8.0.rst b/docs/changelog/0.8.0.rst index 52295a4d2..0e82a1cb5 100644 --- a/docs/changelog/0.8.0.rst +++ b/docs/changelog/0.8.0.rst @@ -62,8 +62,6 @@ Breaking Changes from onetl.core import DBWriter from onetl.core import FileDownloader from onetl.core import FileUploader - from onetl.core import FileResult - from onetl.core import FileSet with new modules ``onetl.db`` and ``onetl.file``: @@ -76,10 +74,6 @@ Breaking Changes from onetl.file import FileDownloader from onetl.file import FileUploader - # not a public interface - from onetl.file.file_result import FileResult - from onetl.file.file_set import FileSet - Imports from old module ``onetl.core`` still can be used, but marked as deprecated. Module will be removed in v1.0.0. 
(:github:pull:`46`) diff --git a/docs/changelog/0.9.0.rst b/docs/changelog/0.9.0.rst index 24927f945..ddd561e5f 100644 --- a/docs/changelog/0.9.0.rst +++ b/docs/changelog/0.9.0.rst @@ -6,43 +6,43 @@ Breaking Changes - Rename methods: - * ``DBConnection.read_df`` -> ``DBConnection.read_source_as_df`` - * ``DBConnection.write_df`` -> ``DBConnection.write_df_to_target`` (:github:pull:`66`) + * ``DBConnection.read_df`` → ``DBConnection.read_source_as_df`` + * ``DBConnection.write_df`` → ``DBConnection.write_df_to_target`` (:github:pull:`66`) - Rename classes: - * ``HDFS.slots`` -> ``HDFS.Slots`` - * ``Hive.slots`` -> ``Hive.Slots`` + * ``HDFS.slots`` → ``HDFS.Slots`` + * ``Hive.slots`` → ``Hive.Slots`` Old names are left intact, but will be removed in v1.0.0 (:github:pull:`103`) - Rename options to make them self-explanatory: - * ``Hive.WriteOptions(mode="append")`` -> ``Hive.WriteOptions(if_exists="append")`` - * ``Hive.WriteOptions(mode="overwrite_table")`` -> ``Hive.WriteOptions(if_exists="replace_entire_table")`` - * ``Hive.WriteOptions(mode="overwrite_partitions")`` -> ``Hive.WriteOptions(if_exists="replace_overlapping_partitions")`` + * ``Hive.WriteOptions(mode="append")`` → ``Hive.WriteOptions(if_exists="append")`` + * ``Hive.WriteOptions(mode="overwrite_table")`` → ``Hive.WriteOptions(if_exists="replace_entire_table")`` + * ``Hive.WriteOptions(mode="overwrite_partitions")`` → ``Hive.WriteOptions(if_exists="replace_overlapping_partitions")`` - * ``JDBC.WriteOptions(mode="append")`` -> ``JDBC.WriteOptions(if_exists="append")`` - * ``JDBC.WriteOptions(mode="overwrite")`` -> ``JDBC.WriteOptions(if_exists="replace_entire_table")`` + * ``JDBC.WriteOptions(mode="append")`` → ``JDBC.WriteOptions(if_exists="append")`` + * ``JDBC.WriteOptions(mode="overwrite")`` → ``JDBC.WriteOptions(if_exists="replace_entire_table")`` - * ``Greenplum.WriteOptions(mode="append")`` -> ``Greenplum.WriteOptions(if_exists="append")`` - * ``Greenplum.WriteOptions(mode="overwrite")`` -> 
``Greenplum.WriteOptions(if_exists="replace_entire_table")`` + * ``Greenplum.WriteOptions(mode="append")`` → ``Greenplum.WriteOptions(if_exists="append")`` + * ``Greenplum.WriteOptions(mode="overwrite")`` → ``Greenplum.WriteOptions(if_exists="replace_entire_table")`` - * ``MongoDB.WriteOptions(mode="append")`` -> ``Greenplum.WriteOptions(if_exists="append")`` - * ``MongoDB.WriteOptions(mode="overwrite")`` -> ``Greenplum.WriteOptions(if_exists="replace_entire_collection")`` + * ``MongoDB.WriteOptions(mode="append")`` → ``MongoDB.WriteOptions(if_exists="append")`` + * ``MongoDB.WriteOptions(mode="overwrite")`` → ``MongoDB.WriteOptions(if_exists="replace_entire_collection")`` - * ``FileDownloader.Options(mode="error")`` -> ``FileDownloader.Options(if_exists="error")`` - * ``FileDownloader.Options(mode="ignore")`` -> ``FileDownloader.Options(if_exists="ignore")`` - * ``FileDownloader.Options(mode="overwrite")`` -> ``FileDownloader.Options(if_exists="replace_file")`` - * ``FileDownloader.Options(mode="delete_all")`` -> ``FileDownloader.Options(if_exists="replace_entire_directory")`` + * ``FileDownloader.Options(mode="error")`` → ``FileDownloader.Options(if_exists="error")`` + * ``FileDownloader.Options(mode="ignore")`` → ``FileDownloader.Options(if_exists="ignore")`` + * ``FileDownloader.Options(mode="overwrite")`` → ``FileDownloader.Options(if_exists="replace_file")`` + * ``FileDownloader.Options(mode="delete_all")`` → ``FileDownloader.Options(if_exists="replace_entire_directory")`` - * ``FileUploader.Options(mode="error")`` -> ``FileUploader.Options(if_exists="error")`` - * ``FileUploader.Options(mode="ignore")`` -> ``FileUploader.Options(if_exists="ignore")`` - * ``FileUploader.Options(mode="overwrite")`` -> ``FileUploader.Options(if_exists="replace_file")`` - * ``FileUploader.Options(mode="delete_all")`` -> ``FileUploader.Options(if_exists="replace_entire_directory")`` + * ``FileUploader.Options(mode="error")`` → ``FileUploader.Options(if_exists="error")`` + *
``FileUploader.Options(mode="ignore")`` → ``FileUploader.Options(if_exists="ignore")`` + * ``FileUploader.Options(mode="overwrite")`` → ``FileUploader.Options(if_exists="replace_file")`` + * ``FileUploader.Options(mode="delete_all")`` → ``FileUploader.Options(if_exists="replace_entire_directory")`` - * ``FileMover.Options(mode="error")`` -> ``FileMover.Options(if_exists="error")`` - * ``FileMover.Options(mode="ignore")`` -> ``FileMover.Options(if_exists="ignore")`` - * ``FileMover.Options(mode="overwrite")`` -> ``FileMover.Options(if_exists="replace_file")`` - * ``FileMover.Options(mode="delete_all")`` -> ``FileMover.Options(if_exists="replace_entire_directory")`` + * ``FileMover.Options(mode="error")`` → ``FileMover.Options(if_exists="error")`` + * ``FileMover.Options(mode="ignore")`` → ``FileMover.Options(if_exists="ignore")`` + * ``FileMover.Options(mode="overwrite")`` → ``FileMover.Options(if_exists="replace_file")`` + * ``FileMover.Options(mode="delete_all")`` → ``FileMover.Options(if_exists="replace_entire_directory")`` Old names are left intact, but will be removed in v1.0.0 (:github:pull:`108`) - Rename ``onetl.log.disable_clients_logging()`` to ``onetl.log.setup_clients_logging()``. (:github:pull:`120`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 715b7fdf6..62c93671d 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 0.11.0 0.10.2 0.10.1 0.10.0 diff --git a/docs/conf.py b/docs/conf.py index 188b1cdbd..9427e1902 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,7 +16,7 @@ import sys from pathlib import Path -from packaging import version as Version +from packaging.version import Version PROJECT_ROOT_DIR = Path(__file__).parent.parent.resolve() @@ -34,7 +34,7 @@ # # The short X.Y version. 
-ver = Version.parse(subprocess.check_output("python ../setup.py --version", shell=True, text=True).strip()) +ver = Version(subprocess.check_output("python ../setup.py --version", shell=True, text=True).strip()) version = ver.base_version # The full version, including alpha/beta/rc tags. release = ver.public @@ -54,11 +54,11 @@ "sphinx_copybutton", "sphinx.ext.autodoc", "sphinx.ext.autosummary", - "sphinxcontrib.autodoc_pydantic", "sphinxcontrib.towncrier", # provides `towncrier-draft-entries` directive "sphinxcontrib.plantuml", "sphinx.ext.extlinks", "sphinx_favicon", + "sphinxcontrib.autodoc_pydantic", ] numpydoc_show_class_members = False autodoc_pydantic_model_show_config = False @@ -70,6 +70,12 @@ autodoc_pydantic_field_list_validators = False sphinx_tabs_disable_tab_closing = True +# prevent >>>, ... prompts and doctest outputs from being copied +copybutton_prompt_text = r">>> |\.\.\. |\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True +copybutton_copy_empty_lines = False +copybutton_only_copy_prompt_lines = True + towncrier_draft_autoversion_mode = "draft" towncrier_draft_include_empty = False towncrier_draft_working_directory = PROJECT_ROOT_DIR diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index 11e01c753..f33369c57 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in Clickhouse Use ``Clickhouse.fetch`` ~~~~~~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number of rows**, like reading Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Clickhouse.FetchOptions `.
Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = clickhouse.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Clickhouse.JDBCOptions(query_timeout=10), + options=Clickhouse.FetchOptions(query_timeout=10), ) clickhouse.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Clickhouse.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Clickhouse.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by Clickhouse, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on @@ -98,7 +98,7 @@ Examples ENGINE = MergeTree() ORDER BY id """, - options=Clickhouse.JDBCOptions(query_timeout=10), + options=Clickhouse.ExecuteOptions(query_timeout=10), ) Notes @@ -111,9 +111,17 @@ So it should **NOT** be used to read large amounts of data. Use :ref:`DBReader < Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: ClickhouseFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. 
autopydantic_model:: ClickhouseExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index 654add047..03384b1a0 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* Clickhouse server versions: 20.7 or higher +* Clickhouse server versions: 21.1 or higher * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 @@ -21,7 +21,7 @@ BEFORE creating the connector instance. See :ref:`install-spark` installation instruction for more details. Connecting to Clickhouse ------------------------ +------------------------ Connection port ~~~~~~~~~~~~~~~ @@ -30,11 +30,27 @@ Connector can only use **HTTP** (usually ``8123`` port) or **HTTPS** (usually `` TCP and GRPC protocols are NOT supported. -Clickhouse cluster interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Connecting to cluster +~~~~~~~~~~~~~~~~~~~~~ -If you're using Clickhouse cluster, it is currently possible to connect only to one specific cluster node. -Connecting to multiple nodes simultaneously is not supported. +It is possible to connect to a Clickhouse cluster, and use its load balancing capabilities to read or write data in parallel. +Each Spark executor can connect to random Clickhouse nodes, instead of sending all the data to a node specified in connection params. + +This requires all Clickhouse servers to run on different hosts, and **listen on the same HTTP port**. +Set ``auto_discovery=True`` to enable this feature (disabled by default): + +.. code:: python + + Clickhouse( + host="node1.of.cluster", + port=8123, + extra={ + "auto_discovery": True, + "load_balancing_policy": "roundRobin", + }, + ) + +See `official documentation `_.
Required grants ~~~~~~~~~~~~~~~ diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index 33bcff4ce..0b2bc929a 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Clickhouse) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Clickhouse.ReadOptions `) Examples -------- @@ -85,9 +85,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: ClickhouseReadOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst index 1a3a1d52a..376b2d0c3 100644 --- a/docs/connection/db_connection/clickhouse/sql.rst +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -43,7 +43,7 @@ Examples WHERE key = 'something' """, - options=Clickhouse.ReadOptions( + options=Clickhouse.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -66,3 +66,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.clickhouse.options + +.. 
autopydantic_model:: ClickhouseSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 1df369dd4..00a71551a 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -14,7 +14,7 @@ Reading from Clickhouse This is how Clickhouse connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Clickhouse type. -* Find corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Clickhouse type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing Clickhouse table @@ -25,8 +25,8 @@ This is how Clickhouse connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. -* **Find corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ -* Find corresponding ``Spark type`` -> ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* **Find corresponding** ``Clickhouse type (read)`` → ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` → ``Clickhousetype (write)`` combination (see below) for each DataFrame column. 
If no combination is found, raise exception. * If ``Clickhousetype (write)`` match ``Clickhouse type (read)``, no additional casts will be performed, DataFrame column will be written to Clickhouse as is. * If ``Clickhousetype (write)`` does not match ``Clickhouse type (read)``, DataFrame column will be casted to target column type **on Clickhouse side**. For example, you can write column with text data to ``Int32`` column, if column contains valid integer values within supported value range and precision. @@ -47,7 +47,7 @@ Create new table using Spark This is how Clickhouse connector performs this: -* Find corresponding ``Spark type`` -> ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Clickhouse, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. 
@@ -125,11 +125,9 @@ Numeric types ~~~~~~~~~~~~~ +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +================================+===================================+===============================+===============================+ -| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | +| ``Bool`` | ``BooleanType()`` | ``Bool`` | ``UInt64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ @@ -147,11 +145,9 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Decimal256(S=0..76)`` | unsupported [3]_ | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | -+--------------------------------+ | | | -| ``Float64`` | | | | +| ``Float32`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | +| ``Float64`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | 
+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | +--------------------------------+ | | | @@ -161,25 +157,25 @@ Numeric types +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | -+--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``Int256`` | unsupported [3]_ | | | +| ``Int128`` | unsupported [3]_ | | | ++--------------------------------+ | | | +| ``Int256`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``-`` | ``ByteType()`` | ``Int8`` | ``Int8`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------+ | | | -| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt16`` | ``LongType()`` | ``Int64`` | ``Int64`` | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ | ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | 
+--------------------------------+ | | | | ``UInt64`` | | | | -+--------------------------------+ | | | -| ``UInt128`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ -| ``UInt256`` | unsupported [3]_ | | | +| ``UInt128`` | unsupported [3]_ | | | ++--------------------------------+ | | | +| ``UInt256`` | | | | +--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ .. [3] @@ -198,33 +194,37 @@ Notes: * ``TIMESTAMP`` is alias for ``DateTime32``, but ``TIMESTAMP(N)`` is alias for ``DateTime64(N)`` +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +===================================+======================================+==================================+===============================+ | ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``Date32`` | unsupported | | | +| ``Date32`` | ``DateType()`` | ``Date`` | ``Date``, | +| | | | **cannot insert data** [4]_ | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds | 
+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ | ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | | | | | **precision loss** [5]_ | -| | | | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | -+-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [6]_ | -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | -| | **precision loss** [4]_ | **precision loss** [4]_ | | +| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+ | **precision loss** [7]_ | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | | | +| | **precision loss** [6]_ | | | | | | | | -+-----------------------------------+--------------------------------------+----------------------------------+ | -| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+ | | +| ``-`` | ``TimestampNTZType()``, microseconds | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime32(TZ)`` | unsupported [7]_ | | | ++-----------------------------------+ | | | +| ``DateTime64(P, TZ)`` | | | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| 
``IntervalNanosecond`` | unsupported | | | +| ``IntervalNanosecond`` | ``LongType()`` | ``Int64`` | ``Int64`` | +-----------------------------------+ | | | | ``IntervalMicrosecond`` | | | | +-----------------------------------+ | | | | ``IntervalMillisecond`` | | | | -+-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ -| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalSecond`` | | | | +-----------------------------------+ | | | | ``IntervalMinute`` | | | | +-----------------------------------+ | | | @@ -267,17 +267,27 @@ Notes: * `Spark TimestampType documentation `_ .. [4] - Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), - but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). - Nanoseconds will be lost during read or write operations. + ``Date32`` has different bytes representation than ``Date``, and inserting value of type ``Date32`` to ``Date`` column + leads to errors on Clickhouse side, e.g. ``Date(106617) should be between 0 and 65535 inclusive of both values``. + Although Spark does properly read the ``Date32`` column as ``DateType()``, and there should be no difference at all. + Probably this is some bug in Clickhouse driver. .. [5] Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). Inserting data with milliseconds precision (``23:59:59.999``) will lead to **throwing away milliseconds**. + Solution: create table manually, with proper column type. .. [6] - Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. - So you can create Clickhouse table with Spark, but cannot write data to column of this type. 
+ Clickhouse supports datetime up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + Solution: create table manually, with proper column type. + +.. [7] + Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss`` + of ``DateTime32`` column type (see [5]_). + So Spark can create Clickhouse table, but cannot write data to column of this type. + Solution: create table manually, with proper column type. String types ~~~~~~~~~~~~~ @@ -296,6 +306,8 @@ String types | ``IPv4`` | | | | +--------------------------------------+ | | | | ``IPv6`` | | | | ++--------------------------------------+ | | | +| ``UUID`` | | | | +--------------------------------------+------------------+ | | | ``-`` | ``BinaryType()`` | | | +--------------------------------------+------------------+------------------------+--------------------------+ @@ -316,9 +328,8 @@ Columns of these Clickhouse types cannot be read by Spark: * ``Ring`` * ``SimpleAggregateFunction(func, T)`` * ``Tuple(T1, T2, ...)`` - * ``UUID`` -Dataframe with these Spark types be written to Clickhouse: +Dataframe with these Spark types cannot be written to Clickhouse: * ``ArrayType(T)`` * ``BinaryType()`` * ``CharType(N)`` @@ -341,15 +352,20 @@ Explicit type cast ~~~~~~~~~~~~ Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format, -and then cast string column in resulting dataframe to proper type using `from_json `_: -.. code:: python + +For parsing JSON columns in ClickHouse, use the :obj:`JSON.parse_column ` method. + +.. 
code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import ArrayType, IntegerType + from onetl.file.format import JSON + from onetl.connection import Clickhouse + from onetl.db import DBReader + reader = DBReader( connection=clickhouse, + table="default.source_tbl", columns=[ "id", "toJSONString(array_column) array_column", @@ -360,18 +376,25 @@ and then cast string column in resulting dataframe to proper type using `from_js # Spark requires all columns to have some specific type, describe it column_type = ArrayType(IntegerType()) + json = JSON() df = df.select( df.id, - from_json(df.array_column, column_type).alias("array_column"), + json.parse_column("array_column", column_type), ) ``DBWriter`` ~~~~~~~~~~~~ -Convert dataframe column to JSON using `to_json `_, -and write it as ``String`` column in Clickhouse: +For writing JSON data to ClickHouse, use the :obj:`JSON.serialize_column ` method to convert a DataFrame column to JSON format efficiently and write it as a ``String`` column in Clickhouse. + -.. code:: python +.. code-block:: python + + from onetl.file.format import JSON + from onetl.connection import Clickhouse + from onetl.db import DBWriter + + clickhouse = Clickhouse(...) clickhouse.execute( """ @@ -384,11 +407,10 @@ and write it as ``String`` column in Clickhouse: """, ) - from pyspark.sql.functions import to_json - + json = JSON() df = df.select( df.id, - to_json(df.array_column).alias("array_column_json"), + json.serialize_column(df.array_column).alias("array_column_json"), ) writer.run(df) diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 1fe56868b..6237bdf16 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -45,11 +45,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`Clickhouse.WriteOptions ` -.. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.clickhouse.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: ClickhouseWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index c0c415369..d19f8ab9d 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in Greenplum Use ``Greenplum.fetch`` ~~~~~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number of rows**, like reading Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Greenplum.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = greenplum.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Greenplum.JDBCOptions(query_timeout=10), + options=Greenplum.FetchOptions(query_timeout=10), ) greenplum.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Greenplum.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Greenplum.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``.
@@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by Greenplum, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on @@ -99,7 +99,7 @@ Examples ) DISTRIBUTED BY id """, - options=Greenplum.JDBCOptions(query_timeout=10), + options=Greenplum.ExecuteOptions(query_timeout=10), ) Interaction schema @@ -143,9 +143,17 @@ The only port used while interacting with Greenplum in this case is ``5432`` (Gr Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.greenplum.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: GreenplumFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + + +.. autopydantic_model:: GreenplumExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 310521677..98674d55a 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -27,14 +27,15 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Greenplum) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`GreenplumReadOptions `) +* ✅︎ ``options`` (see :obj:`Greenplum.ReadOptions `) .. warning:: In case of Greenplum connector, ``DBReader`` does not generate raw ``SELECT`` query. Instead it relies on Spark SQL syntax which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL. 
- So ``columns``, ``where`` and ``hwm.expression`` should be specified in Spark SQL syntax, not Greenplum SQL. + So ``columns``, ``where`` and ``hwm.expression`` should be specified in `Spark SQL `_ syntax, + not Greenplum SQL. This is OK: diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst index baea34914..9199467e2 100644 --- a/docs/connection/db_connection/greenplum/types.rst +++ b/docs/connection/db_connection/greenplum/types.rst @@ -15,7 +15,7 @@ This is how Greenplum connector performs this: * Execute query ``SELECT * FROM table LIMIT 0`` [1]_. * For each column in query result get column name and Greenplum type. -* Find corresponding ``Greenplum type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Greenplum type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Use Spark column projection and predicate pushdown features to build a final query. * Create DataFrame from generated query with inferred schema. @@ -34,7 +34,7 @@ This is how Greenplum connector performs this: * Match table columns with DataFrame columns (by name, case insensitive). If some column is present only in target table, but not in DataFrame (like ``DEFAULT`` or ``SERIAL`` column), and vice versa, raise an exception. See `Explicit type cast`_. -* Find corresponding ``Spark type`` -> ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Greenplumtype (write)`` match ``Greenplum type (read)``, no additional casts will be performed, DataFrame column will be written to Greenplum as is. 
* If ``Greenplumtype (write)`` does not match ``Greenplum type (read)``, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to ``json`` column which Greenplum connector currently does not support. @@ -47,7 +47,7 @@ Create new table using Spark This is how Greenplum connector performs this: -* Find corresponding ``Spark type`` -> ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Greenplum, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. @@ -301,35 +301,29 @@ Explicit type cast ``DBReader`` ~~~~~~~~~~~~ -Unfortunately, it is not possible to cast unsupported column to some supported type on ``DBReader`` side: +Direct casting of Greenplum types is not supported by DBReader due to the connector’s implementation specifics. .. code-block:: python - DBReader( + reader = DBReader( connection=greenplum, # will fail - columns=["CAST(column AS text)"], + columns=["CAST(unsupported_column AS text)"], ) -This is related to Greenplum connector implementation. Instead of passing this ``CAST`` expression to ``SELECT`` query -as is, it performs type cast on Spark side, so this syntax is not supported. - -But there is a workaround - create a view with casting unsupported column to ``text`` (or any other supported type). +But there is a workaround - create a view with casting unsupported column to text (or any other supported type). +For example, you can use `to_json `_ Postgres function to convert column of any type to string representation and then parse this column on Spark side using :obj:`JSON.parse_column ` method. 
-For example, you can use ``to_json`` Postgres function for convert column of any type to string representation. -You can then parse this column on Spark side using `from_json `_: - -.. code:: python +.. code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import ArrayType, IntegerType from onetl.connection import Greenplum from onetl.db import DBReader + from onetl.file.format import JSON greenplum = Greenplum(...) - # create view with proper type cast greenplum.execute( """ CREATE VIEW schema.view_with_json_column AS @@ -350,29 +344,26 @@ You can then parse this column on Spark side using `from_json ` method. -For example, you can convert data using `to_json `_ function. - -.. code:: python +.. code-block:: python - from pyspark.sql.functions import to_json from onetl.connection import Greenplum - from onetl.db import DBReader + from onetl.db import DBWriter + from onetl.file.format import JSON greenplum = Greenplum(...) @@ -390,7 +381,7 @@ For example, you can convert data using `to_json `_ syntax, not HiveQL. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Hive + + hive = Hive(...) + + hive.execute("DROP TABLE schema.table") + hive.execute( + """ + CREATE TABLE schema.table AS ( + id NUMBER, + key VARCHAR, + value DOUBLE + ) + PARTITION BY (business_date DATE) + STORED AS orc + """ + ) + +Details +------- .. currentmodule:: onetl.connection.db_connection.hive.connection diff --git a/docs/connection/db_connection/hive/index.rst b/docs/connection/db_connection/hive/index.rst index 9dd900b07..6d42666cc 100644 --- a/docs/connection/db_connection/hive/index.rst +++ b/docs/connection/db_connection/hive/index.rst @@ -7,6 +7,7 @@ Hive :maxdepth: 1 :caption: Connection + prerequisites connection .. 
toctree:: @@ -14,6 +15,7 @@ Hive :caption: Operations read + sql write execute diff --git a/docs/connection/db_connection/hive/prerequisites.rst b/docs/connection/db_connection/hive/prerequisites.rst new file mode 100644 index 000000000..d690f918f --- /dev/null +++ b/docs/connection/db_connection/hive/prerequisites.rst @@ -0,0 +1,130 @@ +.. _hive-prerequisites: + +Prerequisites +============= + +.. note:: + + onETL's Hive connection is actually SparkSession with access to `Hive Thrift Metastore `_ + and HDFS/S3. + All data motion is made using Spark. Hive Metastore is used only to store tables and partitions metadata. + + This connector does **NOT** require Hive server. It also does **NOT** use Hive JDBC connector. + +Version Compatibility +--------------------- + +* Hive Metastore version: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + + +Installing PySpark +------------------ + +To use Hive connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Hive Metastore +---------------------------- + +.. note:: + + If you're using managed Hadoop cluster, skip this step, as all Spark configs should already be present on the host. + +Create ``$SPARK_CONF_DIR/hive-site.xml`` with Hive Metastore URL: + +.. code:: xml + + + + + + hive.metastore.uris + thrift://metastore.host.name:9083 + + + +Create ``$SPARK_CONF_DIR/core-site.xml`` with warehouse location, e.g. HDFS IPC port of Hadoop namenode, or S3 bucket address & credentials: + +.. tabs:: + + .. code-tab:: xml HDFS + + + + + + fs.defaultFS + hdfs://myhadoopcluster:9820 + + + + .. 
code-tab:: xml S3 + + + + + <!-- See https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html#General_S3A_Client_configuration --> + + fs.defaultFS + s3a://mybucket/ + + + fs.s3a.bucket.mybucket.endpoint + http://s3.somain + + + fs.s3a.bucket.mybucket.connection.ssl.enabled + false + + + fs.s3a.bucket.mybucket.path.style.access + true + + + fs.s3a.bucket.mybucket.aws.credentials.provider + org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider + + + fs.s3a.bucket.mybucket.access.key + some-user + + + fs.s3a.bucket.mybucket.secret.key + mysecrettoken + + + +Using Kerberos +-------------- + +Some managed Hadoop clusters use Kerberos authentication. In this case, you should call `kinit `_ command +**BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. + +Sometimes it is also required to pass keytab file to Spark config, allowing Spark executors to generate their own Kerberos tickets: + +.. tabs:: + + .. code-tab:: python Spark 3 + + SparkSession.builder + .config("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.kerberos.principal", "user") + .config("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + .. code-tab:: python Spark 2 + + SparkSession.builder + .config("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.yarn.principal", "user") + .config("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() + +See `Spark security documentation `_ +for more details. diff --git a/docs/connection/db_connection/hive/read.rst b/docs/connection/db_connection/hive/read.rst index a9961b4ab..fb8091055 100644 --- a/docs/connection/db_connection/hive/read.rst +++ b/docs/connection/db_connection/hive/read.rst @@ -1,13 +1,100 @@ .. 
_hive-read: -Reading from Hive -================= +Reading from Hive using ``DBReader`` +==================================== -There are 2 ways of distributed data reading from Hive: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Hive.sql ` +Supported DBReader features +--------------------------- -.. currentmodule:: onetl.connection.db_connection.hive.connection +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Hive) +* ❌ ``df_schema`` +* ❌ ``options`` (only Spark config params are used) -.. automethod:: Hive.sql +.. warning:: + + Actually, ``columns``, ``where`` and ``hwm.expression`` should be written using `SparkSQL `_ syntax, + not HiveQL. + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBReader + + hive = Hive(...) + + reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + hive = Hive(...) 
+ + reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="hive_hwm", expression="updated_dt"), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For column-based write formats, each file contains separate sections where column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +Use partition columns in ``where`` clause +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Queries should include ``WHERE`` clause with filters on Hive partitioning columns. +This allows Spark to read only small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance. + +Supported operators are: ``=``, ``>``, ``<`` and ``BETWEEN``, and only against some **static** value. diff --git a/docs/connection/db_connection/hive/sql.rst b/docs/connection/db_connection/hive/sql.rst new file mode 100644 index 000000000..7b02ec2b7 --- /dev/null +++ b/docs/connection/db_connection/hive/sql.rst @@ -0,0 +1,81 @@ +.. 
_hive-sql: + +Reading from Hive using ``Hive.sql`` +==================================== + +``Hive.sql`` allows passing custom SQL query, but does not support incremental strategies. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +.. warning:: + + Actually, query should be written using `SparkSQL `_ syntax, not HiveQL. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Hive + + hive = Hive(...) + df = hive.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """ + ) + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For column-based write formats, each file contains separate sections where column data is stored. The file footer contains +location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This drastically reduces the amount of data read by Spark, **if column-based file formats are used**. + +Use partition columns in ``where`` clause +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Queries should include ``WHERE`` clause with filters on Hive partitioning columns. +This allows Spark to read only small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance. 
+ +Supported operators are: ``=``, ``>``, ``<`` and ``BETWEEN``, and only against some **static** value. + +Details +------- + +.. currentmodule:: onetl.connection.db_connection.hive.connection + +.. automethod:: Hive.sql diff --git a/docs/connection/db_connection/hive/write.rst b/docs/connection/db_connection/hive/write.rst index 70c9f3099..0b286a83d 100644 --- a/docs/connection/db_connection/hive/write.rst +++ b/docs/connection/db_connection/hive/write.rst @@ -1,9 +1,195 @@ .. _hive-write: -Writing to Hive -=============== +Writing to Hive using ``DBWriter`` +=================================== -For writing data to Hive, use :obj:`DBWriter ` with options below. +For writing data to Hive, use :obj:`DBWriter `. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Hive + from onetl.db import DBWriter + + hive = Hive(...) + + df = ... # data is here + + # Create dataframe with specific number of Spark partitions. + # Use the Hive partitioning columns to group the data. Create only 20 files per Hive partition. + # Also sort the data by column which most data is correlated with, reducing files size. + write_df = df.repartition( + 20, "country", "business_date", "user_id" + ).sortWithinPartitions("country", "business_date", "user_id") + + writer = DBWriter( + connection=hive, + target="schema.table", + options=Hive.WriteOptions( + if_exists="append", + # Hive partitioning columns. + # ``user_id`` column is not included, as it has a lot of distinct values. + partitionBy=["country", "business_date"], + ), + ) + + writer.run(write_df) + +Recommendations +--------------- + +Use column-based write formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Prefer these write formats: + * `ORC `_ + * `Parquet `_ + * `Iceberg `_ + * `Hudi `_ + * `Delta `_ + +For column-based write formats, each file contains separate sections where column data is stored. The file footer contains +location of each column section/group. 
Spark can use this information to load only sections required by specific query, e.g. only selected columns, +to drastically speed up the query. + +Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. + +Use partitioning +~~~~~~~~~~~~~~~~ + +How does it work +^^^^^^^^^^^^^^^^ + +Hive supports splitting data to partitions, which are different directories in filesystem with names like ``some_col=value1/another_col=value2``. + +For example, dataframe with content like this: + ++-----------------+---------------------+--------------+-------------+ +| country: string | business_date: date | user_id: int | bytes: long | ++=================+=====================+==============+=============+ +| RU | 2024-01-01 | 1234 | 25325253525 | ++-----------------+---------------------+--------------+-------------+ +| RU | 2024-01-01 | 2345 | 23234535243 | ++-----------------+---------------------+--------------+-------------+ +| RU | 2024-01-02 | 1234 | 62346634564 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-01 | 5678 | 4252345354 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-02 | 5678 | 5474575745 | ++-----------------+---------------------+--------------+-------------+ +| US | 2024-01-03 | 5678 | 3464574567 | ++-----------------+---------------------+--------------+-------------+ + +With ``partition_by=["country", "business_date"]`` data will be stored as files in the following subfolders: + * ``/country=RU/business_date=2024-01-01/`` + * ``/country=RU/business_date=2024-01-02/`` + * ``/country=US/business_date=2024-01-01/`` + * ``/country=US/business_date=2024-01-02/`` + * ``/country=US/business_date=2024-01-03/`` + +A separate subdirectory is created for each distinct combination of column values in the dataframe. 
+ +Please do not confuse Spark dataframe partitions (a.k.a batches of data handled by Spark executors, usually in parallel) +and Hive partitioning (store data in different subdirectories). +Number of Spark dataframe partitions is correlated with the number of files created in **each** Hive partition. +For example, Spark dataframe with 10 partitions and 5 distinct values of Hive partition columns will be saved as 5 subfolders with 10 files each = 50 files in total. +Without Hive partitioning, all the files are placed into one flat directory. + +But why? +^^^^^^^^ + +Queries which have ``WHERE`` clause with filters on Hive partitioning columns, like ``WHERE country = 'RU' AND business_date='2024-01-01'``, will +read only files from these exact partitions, like ``/country=RU/business_date=2024-01-01/``, and skip files from other partitions. + +This drastically increases performance and reduces the amount of memory used by Spark. +Consider using Hive partitioning in all tables. + +Which columns should I use? +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Usually Hive partitioning columns are based on event date or location, like ``country: string``, ``business_date: date``, ``run_date: date`` and so on. + +**Partition columns should contain data with low cardinality.** +Dates, small integers, strings with low number of possible values are OK. +But timestamp, float, decimals, longs (like user id), strings with lots of unique values (like user name or email) should **NOT** be used as Hive partitioning columns. +Unlike some other databases, range and hash-based partitions are not supported. + +Partition column should be a part of a dataframe. If you want to partition values by date component of ``business_dt: timestamp`` column, +add a new column to dataframe like this: ``df.withColumn("business_date", date(df.business_dt))``. + +Use compression +~~~~~~~~~~~~~~~ + +Using compression algorithms like ``snappy``, ``lz4`` or ``zstd`` can reduce the size of files (up to 10x). 
+ +Prefer creating large files +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Storing millions of small files is not what HDFS and S3 are designed for. Minimal file size should be at least 10Mb, but usually it is like 128Mb+ or 256Mb+ (HDFS block size). +**NEVER** create files only a few Kbytes in size. + +Number of files can be different in different cases. +On one hand, Spark Adaptive Query Execution (AQE) can merge small Spark dataframe partitions into one larger. +On the other hand, dataframes with skewed data can produce a larger number of files than expected. + +To create a small number of large files, you can reduce number of Spark dataframe partitions. +Use `df.repartition(N, columns...) `_ function, +like this: ``df.repartition(20, "col1", "col2")``. +This creates new Spark dataframe with partitions using ``hash(df.col1 + df.col2) mod 20`` expression, avoiding data skew. + +Note: larger dataframe partitions require more resources (CPU, RAM) on Spark executor. The exact number of partitions +should be determined empirically, as it depends on the amount of data and available resources. 
+ +Sort data before writing +~~~~~~~~~~~~~~~~~~~~~~~~ + +Dataframe with sorted content: + ++-----------------+---------------------+--------------+-------------------------+-------------+ +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | ++=================+=====================+==============+=========================+=============+ +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-01 | 1234 | 2024-01-01T12:23:44.567 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-02 | 1234 | 2024-01-01T13:25:56.789 | 34335645635 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-01 | 2345 | 2024-01-01T10:00:00.000 | 12341 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-02 | 2345 | 2024-01-01T15:11:22.345 | 13435 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-03 | 2345 | 2024-01-01T20:22:33.567 | 14564 | ++-----------------+---------------------+--------------+-------------------------+-------------+ + +Has a much better compression rate than unsorted one, e.g. 
2x or even higher: + ++-----------------+---------------------+--------------+-------------------------+-------------+ +| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long | ++=================+=====================+==============+=========================+=============+ +| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-01 | 6345 | 2024-12-01T23:03:44.567 | 25365 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| RU | 2024-01-02 | 5234 | 2024-07-01T06:10:56.789 | 45643456747 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-01 | 4582 | 2024-04-01T17:59:00.000 | 362546475 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-02 | 2345 | 2024-09-01T04:24:22.345 | 3235 | ++-----------------+---------------------+--------------+-------------------------+-------------+ +| US | 2024-01-03 | 3575 | 2024-03-01T21:37:33.567 | 346345764 | ++-----------------+---------------------+--------------+-------------------------+-------------+ + +Choosing columns to sort data by really depends on the data. If data is correlated with some specific +column, like in example above the amount of traffic is correlated with both ``user_id`` and ``timestamp``, +use ``df.sortWithinPartitions("user_id", "timestamp")`` before writing the data. + +If ``df.repartition(N, repartition_columns...)`` is used in combination with ``df.sortWithinPartitions(sort_columns...)``, +then ``sort_columns`` should start with ``repartition_columns`` or be equal to it. + +Options +------- .. 
currentmodule:: onetl.connection.db_connection.hive.options diff --git a/docs/connection/db_connection/kafka/index.rst b/docs/connection/db_connection/kafka/index.rst index a02aaecdc..9cef52066 100644 --- a/docs/connection/db_connection/kafka/index.rst +++ b/docs/connection/db_connection/kafka/index.rst @@ -7,7 +7,9 @@ Kafka :maxdepth: 1 :caption: Connection + prerequisites connection + troubleshooting .. toctree:: :maxdepth: 1 diff --git a/docs/connection/db_connection/kafka/prerequisites.rst b/docs/connection/db_connection/kafka/prerequisites.rst new file mode 100644 index 000000000..d6e0ac99d --- /dev/null +++ b/docs/connection/db_connection/kafka/prerequisites.rst @@ -0,0 +1,70 @@ +.. _kafka-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Kafka server versions: 0.10 or higher +* Spark versions: 2.4.x - 3.5.x +* Java versions: 8 - 17 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Kafka connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Kafka +----------------------- + +Connection address +~~~~~~~~~~~~~~~~~~ + +Kafka is a distributed service, and usually has a list of brokers you can connect to (unlike other connectors, where only one host+port can be set). +Please contact your Kafka administrator to get addresses of these brokers, as there are no defaults. + +Also Kafka has a feature called *advertised listeners* - client connects to one broker, and receives a list of other brokers in the cluster. +So you don't have to pass all brokers to ``addresses``, it can be some subset. Other broker addresses will be fetched directly from the cluster. + +Connection protocol +~~~~~~~~~~~~~~~~~~~ + +Kafka can support different connection protocols. 
List of currently supported protocols: + * :obj:`PLAINTEXT ` (not secure) + * :obj:`SSL ` (secure, recommended) + +Note that specific port can listen for only one of these protocols, so it is important to set +proper port number + protocol combination. + +Authentication mechanism +~~~~~~~~~~~~~~~~~~~~~~~~ + +Kafka can support different authentication mechanisms (also known as `SASL `_). +List of currently supported mechanisms: + * :obj:`PLAIN `. To not confuse this with ``PLAINTEXT`` connection protocol, onETL uses name ``BasicAuth``. + * :obj:`GSSAPI `. To simplify naming, onETL uses name ``KerberosAuth``. + * :obj:`SCRAM-SHA-256 or SCRAM-SHA-512 ` (recommended). + +Different mechanisms use different types of credentials (login + password, keytab file, and so on). + +Note that connection protocol and auth mechanism are set in pairs: + * If you see ``SASL_PLAINTEXT`` this means ``PLAINTEXT`` connection protocol + some auth mechanism. + * If you see ``SASL_SSL`` this means ``SSL`` connection protocol + some auth mechanism. + * If you see just ``PLAINTEXT`` or ``SSL`` (**no** ``SASL``), this means that authentication is disabled (anonymous access). + +Please contact your Kafka administrator to get details about enabled auth mechanism in a specific Kafka instance. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Kafka administrator to set following grants for a user, *if Kafka instance uses ACL*: + * ``Describe`` + ``Read`` for reading data from Kafka (Consumer). + * ``Describe`` + ``Write`` for writing data to Kafka (Producer). + +More details can be found in `documentation `_. diff --git a/docs/connection/db_connection/kafka/read.rst b/docs/connection/db_connection/kafka/read.rst index 8b2917943..292089e06 100644 --- a/docs/connection/db_connection/kafka/read.rst +++ b/docs/connection/db_connection/kafka/read.rst @@ -3,62 +3,132 @@ Reading from Kafka ================== -For reading data from Kafka, use :obj:`DBReader ` with specific options (see below). 
+Data can be read from Kafka to Spark using :obj:`DBReader `. +It also supports :ref:`strategy` for incremental data reading. + +Supported DBReader features +--------------------------- + +* ❌ ``columns`` (is not supported by Kafka) +* ❌ ``where`` (is not supported by Kafka) +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ❌ :ref:`snapshot-batch-strategy` +* * ❌ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Kafka) +* ❌ ``df_schema`` (see note below) +* ✅︎ ``options`` (see :obj:`Kafka.ReadOptions `) + +Dataframe schema +---------------- + +Unlike other DB connections, Kafka does not have a concept of columns. +All topic messages have the same set of fields, see structure below: + +.. code:: text + + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = false) + |-- partition: integer (nullable = false) + |-- offset: long (nullable = false) + |-- timestamp: timestamp (nullable = false) + |-- timestampType: integer (nullable = false) + |-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) + +``headers`` field is present in the dataframe only if ``Kafka.ReadOptions(include_headers=True)`` is passed (compatibility with Kafka 1.x). + + +Value deserialization +--------------------- + +To read ``value`` or ``key`` of other type than bytes (e.g. struct or integer), users have to deserialize values manually. + +This could be done using following methods: + * :obj:`Avro.parse_column ` + * :obj:`JSON.parse_column ` + * :obj:`CSV.parse_column ` + * :obj:`XML.parse_column ` + +Examples +-------- + +Snapshot strategy, ``value`` is Avro binary data: + +.. 
code-block:: python + + from onetl.connection import Kafka + from onetl.db import DBReader, DBWriter + from onetl.file.format import Avro + from pyspark.sql.functions import decode + + # read all topic data from Kafka + kafka = Kafka(...) + reader = DBReader(connection=kafka, source="avro_topic") + read_df = reader.run() + + # parse Avro format to Spark struct + avro = Avro( + schema_dict={ + "type": "record", + "name": "Person", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"}, + ], + } + ) + deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + avro.parse_column("value"), + ) + +Incremental strategy, ``value`` is JSON string: .. note:: - Unlike other connection classes, Kafka always return dataframe with fixed schema - (see `documentation `_): - - .. dropdown:: DataFrame Schema - - .. code:: python - - from pyspark.sql.types import ( - ArrayType, - BinaryType, - IntegerType, - LongType, - StringType, - StructField, - StructType, - TimestampType, - ) - - schema = StructType( - [ - StructField("value", BinaryType(), nullable=True), - StructField("key", BinaryType(), nullable=True), - StructField("topic", StringType(), nullable=False), - StructField("partition", IntegerType(), nullable=False), - StructField("offset", LongType(), nullable=False), - StructField("timestamp", TimestampType(), nullable=False), - StructField("timestampType", IntegerType(), nullable=False), - # this field is returned only with ``include_headers=True`` - StructField( - "headers", - ArrayType( - StructType( - [ - StructField("key", StringType(), nullable=False), - StructField("value", BinaryType(), nullable=True), - ], - ), - ), - nullable=True, - ), - ], - ) - -.. warning:: - - Columns: - - * ``value`` - * ``key`` - * ``headers[*].value`` - - are always returned as raw bytes. If they contain values of custom type, these values should be deserialized manually. 
+ Currently Kafka connector supports only HWMs based on ``offset`` field. Other fields, like ``timestamp``, are not yet supported. + +.. code-block:: python + + from onetl.connection import Kafka + from onetl.db import DBReader, DBWriter + from onetl.file.format import JSON + from pyspark.sql.functions import decode + + kafka = Kafka(...) + + # read only new data from Kafka topic + reader = DBReader( + connection=kafka, + source="topic_name", + hwm=DBReader.AutoDetectHWM(name="kafka_hwm", expression="offset"), + ) + + with IncrementalStrategy(): + read_df = reader.run() + + # parse JSON format to Spark struct + json = JSON() + schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ], + ) + deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + json.parse_column("value", schema), + ) + +Options +------- .. currentmodule:: onetl.connection.db_connection.kafka.options diff --git a/docs/connection/db_connection/kafka/troubleshooting.rst b/docs/connection/db_connection/kafka/troubleshooting.rst new file mode 100644 index 000000000..fde6fafa6 --- /dev/null +++ b/docs/connection/db_connection/kafka/troubleshooting.rst @@ -0,0 +1,13 @@ +.. _kafka-troubleshooting: + +Kafka Troubleshooting +===================== + +.. note:: + + General guide: :ref:`troubleshooting`. + +Cannot connect using ``SSL`` protocol +------------------------------------- + +Please check that certificate files are not Base-64 encoded. diff --git a/docs/connection/db_connection/kafka/write.rst b/docs/connection/db_connection/kafka/write.rst index 064c8ead1..6b24aa251 100644 --- a/docs/connection/db_connection/kafka/write.rst +++ b/docs/connection/db_connection/kafka/write.rst @@ -5,58 +5,68 @@ Writing to Kafka For writing data to Kafka, use :obj:`DBWriter ` with specific options (see below). -..
note:: - - Unlike other connection classes, Kafka only accepts dataframe with fixed schema - (see `documentation `_): - - .. dropdown:: DataFrame Schema - - .. code:: python - - from pyspark.sql.types import ( - ArrayType, - BinaryType, - IntegerType, - StringType, - StructField, - StructType, - ) - - schema = StructType( - [ - # mandatory fields: - StructField("value", BinaryType(), nullable=True), - # optional fields, can be omitted: - StructField("key", BinaryType(), nullable=True), - StructField("partition", IntegerType(), nullable=True), - # this field can be passed only with ``include_headers=True`` - StructField( - "headers", - ArrayType( - StructType( - [ - StructField("key", StringType(), nullable=False), - StructField("value", BinaryType(), nullable=True), - ], - ), - ), - nullable=True, - ), - ], - ) - - You cannot pass dataframe with other column names or types. - -.. warning:: - - Columns: - - * ``value`` - * ``key`` - * ``headers[*].value`` - - can only be string or raw bytes. If they contain values of custom type, these values should be serialized manually. +Dataframe schema +---------------- + +Unlike other DB connections, Kafka does not have concept of columns. +All the topics messages have the same set of fields. Only some of them can be written: + +.. code:: text + + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) + +``headers`` can be passed only with ``Kafka.WriteOptions(include_headers=True)`` (compatibility with Kafka 1.x). + +Field ``topic`` should not be present in the dataframe, as it is passed to ``DBWriter(target=...)``. + +Other fields, like ``partition``, ``offset``, ``timestamp`` are set by Kafka, and cannot be passed explicitly. + +Value serialization +------------------- + +To write ``value`` or ``key`` of other type than bytes (e.g. 
struct or integer), users have to serialize values manually. + +This could be done using following methods: + * :obj:`Avro.serialize_column ` + * :obj:`JSON.serialize_column ` + * :obj:`CSV.serialize_column ` + +Examples +-------- + +Convert ``value`` to JSON string, and write to Kafka: + +.. code-block:: python + + from onetl.connection import Kafka + from onetl.db import DBWriter + from onetl.file.format import JSON + + df = ... # original data is here + + # serialize struct data as JSON + json = JSON() + write_df = df.select( + df.key, + json.serialize_column(df.value), + ) + + # write data to Kafka + kafka = Kafka(...) + + writer = DBWriter( + connection=kafka, + target="topic_name", + ) + writer.run(write_df) + +Options +------- .. currentmodule:: onetl.connection.db_connection.kafka.options diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst index 6bc6e90e9..7df5f5022 100644 --- a/docs/connection/db_connection/mongodb/prerequisites.rst +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -7,8 +7,7 @@ Version Compatibility --------------------- * MongoDB server versions: 4.0 or higher -* Spark versions: 3.2.x - 3.4.x -* Scala versions: 2.12 - 2.13 +* Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 See `official documentation `_. 
diff --git a/docs/connection/db_connection/mongodb/read.rst b/docs/connection/db_connection/mongodb/read.rst index e90bc6e2b..860630e11 100644 --- a/docs/connection/db_connection/mongodb/read.rst +++ b/docs/connection/db_connection/mongodb/read.rst @@ -23,7 +23,7 @@ Supported DBReader features * * Note that ``expression`` field of HWM can only be a field name, not a custom expression * ✅︎ ``hint`` (see `official documentation `_) * ✅︎ ``df_schema`` (mandatory) -* ✅︎ ``options`` (see :obj:`MongoDBReadOptions `) +* ✅︎ ``options`` (see :obj:`MongoDB.ReadOptions `) Examples -------- diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index bd5978aa9..2a0231643 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -73,8 +73,8 @@ References Here you can find source code with type conversions: -* `MongoDB -> Spark `_ -* `Spark -> MongoDB `_ +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ Supported types --------------- diff --git a/docs/connection/db_connection/mongodb/write.rst b/docs/connection/db_connection/mongodb/write.rst index 3ae86fece..3686a3be7 100644 --- a/docs/connection/db_connection/mongodb/write.rst +++ b/docs/connection/db_connection/mongodb/write.rst @@ -35,7 +35,7 @@ Examples Write options ------------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MongoDB.WriteOptions ` .. 
currentmodule:: onetl.connection.db_connection.mongodb.options diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index 729c75a30..13b348fa9 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in MSSQL Use ``MSSQL.fetch`` ~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number of rows**, like reading MSSQL config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`MSSQL.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -49,7 +49,7 @@ Examples df = mssql.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MSSQL.JDBCOptions(query_timeout=10), + options=MSSQL.FetchOptions(query_timeout=10), ) mssql.close() value = df.collect()[0][0] # get value from first row and first column @@ -59,7 +59,7 @@ Use ``MSSQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`MSSQL.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -97,15 +97,22 @@ Examples value NUMBER ) """, - options=MSSQL.JDBCOptions(query_timeout=10), + options=MSSQL.ExecuteOptions(query_timeout=10), ) Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCOptions +..
autopydantic_model:: MSSQLFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false + +.. autopydantic_model:: MSSQLExecuteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/read.rst b/docs/connection/db_connection/mssql/read.rst index 0c8599aea..c15402a42 100644 --- a/docs/connection/db_connection/mssql/read.rst +++ b/docs/connection/db_connection/mssql/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (MSSQL does support hints, but DBReader not, at least for now) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`MSSQL.ReadOptions `) Examples -------- @@ -85,9 +85,10 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: MSSQLReadOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst index 8a59d376a..e456c6f4e 100644 --- a/docs/connection/db_connection/mssql/sql.rst +++ b/docs/connection/db_connection/mssql/sql.rst @@ -43,7 +43,7 @@ Examples WHERE key = 'something' """, - options=MSSQL.ReadOptions( + options=MSSQL.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -66,3 +66,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. 
This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.mssql.options + +.. autopydantic_model:: MSSQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst index ea7fd7102..1052143ed 100644 --- a/docs/connection/db_connection/mssql/types.rst +++ b/docs/connection/db_connection/mssql/types.rst @@ -14,7 +14,7 @@ Reading from MSSQL This is how MSSQL connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MSSQL type. -* Find corresponding ``MSSQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``MSSQL type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing MSSQL table @@ -25,7 +25,7 @@ This is how MSSQL connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. -* Find corresponding ``Spark type`` -> ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
* If ``MSSQL type (write)`` match ``MSSQL type (read)``, no additional casts will be performed, DataFrame column will be written to MSSQL as is. * If ``MSSQL type (write)`` does not match ``MSSQL type (read)``, DataFrame column will be casted to target column type **on MSSQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [2]_. @@ -48,7 +48,7 @@ Create new table using Spark This is how MSSQL connector performs this: -* Find corresponding ``Spark type`` -> ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in MSSQL, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. diff --git a/docs/connection/db_connection/mssql/write.rst b/docs/connection/db_connection/mssql/write.rst index 854283704..75d9cceb2 100644 --- a/docs/connection/db_connection/mssql/write.rst +++ b/docs/connection/db_connection/mssql/write.rst @@ -40,11 +40,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MSSQL.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mssql.options -.. autopydantic_model:: JDBCWriteOptions +.. 
autopydantic_model:: MSSQLWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index 34460edae..f9e95f5e6 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -17,10 +17,10 @@ There are 2 ways to execute some statement in MySQL Use ``MySQL.fetch`` ~~~~~~~~~~~~~~~~~~~ -Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Use this method to perform some ``SELECT`` query which returns **small number of rows**, like reading MySQL config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`MySQL.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -50,7 +50,7 @@ Examples df = mysql.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MySQL.JDBCOptions(query_timeout=10), + options=MySQL.FetchOptions(query_timeout=10), ) mysql.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``MySQL.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`MySQL.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -69,7 +69,7 @@ Syntax support This method supports **any** query syntax supported by MySQL, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ...
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on @@ -98,15 +98,19 @@ Examples ) ENGINE = InnoDB """, - options=MySQL.JDBCOptions(query_timeout=10), + options=MySQL.ExecuteOptions(query_timeout=10), ) Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: MySQLFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + + +.. autopydantic_model:: MySQLExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index 99b7820cb..225e630b2 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* MySQL server versions: 5.7, 8.0 +* MySQL server versions: 5.7 - 8.4 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/mysql/read.rst b/docs/connection/db_connection/mysql/read.rst index e72da45f1..2618e6859 100644 --- a/docs/connection/db_connection/mysql/read.rst +++ b/docs/connection/db_connection/mysql/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ✅︎ ``hint`` (see `official documentation `_) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`MySQL.ReadOptions `) Examples -------- @@ -87,9 +87,8 @@ Especially if there are indexes for columns used in ``where`` clause. Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. 
autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: MySQLReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst index c161efa83..04881bec7 100644 --- a/docs/connection/db_connection/mysql/sql.rst +++ b/docs/connection/db_connection/mysql/sql.rst @@ -44,7 +44,7 @@ Examples WHERE key = 'something' """, - options=MySQL.ReadOptions( + options=MySQL.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -67,3 +67,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.mysql.options + +.. autopydantic_model:: MySQLSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst index 431665f87..f3fca30a4 100644 --- a/docs/connection/db_connection/mysql/types.rst +++ b/docs/connection/db_connection/mysql/types.rst @@ -14,7 +14,7 @@ Reading from MySQL This is how MySQL connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MySQL type. -* Find corresponding ``MySQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``MySQL type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. 
If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing MySQL table @@ -25,7 +25,7 @@ This is how MySQL connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get MySQL type. -* Find corresponding ``Spark type`` -> ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``MySQL type (write)`` match ``MySQL type (read)``, no additional casts will be performed, DataFrame column will be written to MySQL as is. * If ``MySQL type (write)`` does not match ``MySQL type (read)``, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision. @@ -42,7 +42,7 @@ Create new table using Spark This is how MySQL connector performs this: -* Find corresponding ``Spark type`` -> ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in MySQL, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. @@ -289,16 +289,15 @@ It is possible to explicitly cast column type using ``DBReader(columns=...)`` sy For example, you can use ``CAST(column AS text)`` to convert data to string representation on MySQL side, and so it will be read as Spark's ``StringType()``. 
-It is also possible to use `JSON_OBJECT `_ MySQL function -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +It is also possible to use `JSON_OBJECT `_ MySQL function to convert column of any type to its JSON string representation, and then parse this column on Spark side with the :obj:`JSON.parse_column ` method. .. code-block:: python - from pyspark.sql.types import IntegerType, StructField, StructType + from pyspark.sql.types import IntegerType, StructType, StructField from onetl.connection import MySQL from onetl.db import DBReader + from onetl.file.format import JSON mysql = MySQL(...) @@ -314,30 +313,30 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = StructType([StructField("key", IntegerType())]) + json_scheme = StructType([StructField("key", IntegerType())]) - # cast column content to proper Spark type df = df.select( df.id, df.supported_column, # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.json_column, schema).alias("struct_column"), + JSON().parse_column("json_column", json_scheme).alias("struct_column"), ) ``DBWriter`` ~~~~~~~~~~~~ -Convert dataframe column to JSON using `to_json `_, -and write it as ``text`` column in MySQL: +To write JSON data to a ``json`` or ``text`` column in a MySQL table, use the :obj:`JSON.serialize_column ` method. -.. code:: python +..
code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBWriter + from onetl.file.format import JSON mysql.execute( """ - CREATE TABLE schema.target_tbl AS ( + CREATE TABLE schema.target_tbl ( id bigint, array_column_json json -- any string type, actually ) @@ -345,11 +344,9 @@ and write it as ``text`` column in MySQL: """, ) - from pyspark.sql.functions import to_json - df = df.select( df.id, - to_json(df.array_column).alias("array_column_json"), + JSON().serialize_column(df.array_column).alias("array_column_json"), ) writer.run(df) diff --git a/docs/connection/db_connection/mysql/write.rst b/docs/connection/db_connection/mysql/write.rst index 2d7c056c9..869ccc7c5 100644 --- a/docs/connection/db_connection/mysql/write.rst +++ b/docs/connection/db_connection/mysql/write.rst @@ -44,11 +44,12 @@ Examples Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`MySQL.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.mysql.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: MySQLWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/execute.rst b/docs/connection/db_connection/oracle/execute.rst index b8533b278..fff504ee9 100644 --- a/docs/connection/db_connection/oracle/execute.rst +++ b/docs/connection/db_connection/oracle/execute.rst @@ -20,7 +20,7 @@ Use ``Oracle.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Oracle config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Oracle.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. 
@@ -50,7 +50,7 @@ Examples df = oracle.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Oracle.JDBCOptions(query_timeout=10), + options=Oracle.FetchOptions(query_timeout=10), ) oracle.close() value = df.collect()[0][0] # get value from first row and first column @@ -60,7 +60,7 @@ Use ``Oracle.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Oracle.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -98,15 +98,19 @@ Examples value NUMBER ) """, - options=Oracle.JDBCOptions(query_timeout=10), + options=Oracle.ExecuteOptions(query_timeout=10), ) Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: OracleFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + + +.. autopydantic_model:: OracleExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/read.rst b/docs/connection/db_connection/oracle/read.rst index 6592cfc7b..9fd12a035 100644 --- a/docs/connection/db_connection/oracle/read.rst +++ b/docs/connection/db_connection/oracle/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ✅︎ ``hint`` (see `official documentation `_) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Oracle.ReadOptions `) Examples -------- @@ -87,9 +87,8 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: OracleReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst index 969c28afa..afe46b064 100644 --- a/docs/connection/db_connection/oracle/sql.rst +++ b/docs/connection/db_connection/oracle/sql.rst @@ -44,7 +44,7 @@ Examples WHERE key = 'something' """, - options=Oracle.ReadOptions( + options=Oracle.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -67,3 +67,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.oracle.options + +.. autopydantic_model:: OracleSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst index c06e43b13..2c6116ce4 100644 --- a/docs/connection/db_connection/oracle/types.rst +++ b/docs/connection/db_connection/oracle/types.rst @@ -14,7 +14,7 @@ Reading from Oracle This is how Oracle connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Oracle type. -* Find corresponding ``Oracle type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. 
If no combination is found, raise exception. +* Find corresponding ``Oracle type (read)`` → ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. Writing to some existing Oracle table @@ -25,8 +25,8 @@ This is how Oracle connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. * Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. -* **Find corresponding** ``Oracle type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ -* Find corresponding ``Spark type`` -> ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* **Find corresponding** ``Oracle type (read)`` → ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` → ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Oracle type (write)`` match ``Oracle type (read)``, no additional casts will be performed, DataFrame column will be written to Oracle as is. * If ``Oracle type (write)`` does not match ``Oracle type (read)``, DataFrame column will be casted to target column type **on Oracle side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision. @@ -48,7 +48,7 @@ Create new table using Spark This is how Oracle connector performs this: -* Find corresponding ``Spark type`` -> ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
+* Find corresponding ``Spark type`` → ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Generate DDL for creating table in Oracle, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. @@ -301,12 +301,12 @@ For example, you can use ``CAST(column AS CLOB)`` to convert data to string repr It is also possible to use `JSON_ARRAY `_ or `JSON_OBJECT `_ Oracle functions -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +to convert column of any type to string representation. This JSON string can then be effectively parsed using the :obj:`JSON.parse_column ` method. .. code-block:: python - from pyspark.sql.types import IntegerType + from onetl.file.format import JSON + from pyspark.sql.types import IntegerType, StructType, StructField from onetl.connection import Oracle from onetl.db import DBReader @@ -325,32 +325,27 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = IntegerType() + json_scheme = StructType([StructField("key", IntegerType())]) - # cast column content to proper Spark type df = df.select( df.id, df.supported_column, - # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.array_column_json, schema).alias("array_column"), + JSON().parse_column("array_column_json", json_scheme).alias("array_column"), ) ``DBWriter`` ~~~~~~~~~~~~ -It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Oracle table. +It is always possible to convert data on Spark side to string, and then write it to text column in Oracle table. -For example, you can convert data using `to_json `_ function. 
+To serialize and write JSON data to a ``text`` or ``json`` column in an Oracle table use the :obj:`JSON.serialize_column ` method. -.. code:: python - - from pyspark.sql.functions import to_json +.. code-block:: python from onetl.connection import Oracle - from onetl.db import DBReader + from onetl.db import DBWriter + from onetl.file.format import JSON oracle = Oracle(...) @@ -367,7 +362,7 @@ For example, you can convert data using `to_json ` +Method above accepts :obj:`OracleWriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.oracle.options -.. autopydantic_model:: JDBCWriteOptions +.. autopydantic_model:: OracleWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 8c966c110..e8ef17b03 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -20,7 +20,7 @@ Use ``Postgres.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Postgres config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Postgres.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -48,7 +48,7 @@ Examples df = postgres.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Postgres.JDBCOptions(query_timeout=10), + options=Postgres.FetchOptions(query_timeout=10), ) postgres.close() value = df.collect()[0][0] # get value from first row and first column @@ -58,7 +58,7 @@ Use ``Postgres.execute`` Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Postgres.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -67,7 +67,7 @@ Syntax support This method supports **any** query syntax supported by Postgres, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on @@ -96,15 +96,19 @@ Examples value real ) """, - options=Postgres.JDBCOptions(query_timeout=10), + options=Postgres.ExecuteOptions(query_timeout=10), ) Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: PostgresFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + + +.. autopydantic_model:: PostgresExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index 67b5234a2..fa3fe1728 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -22,7 +22,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Postgres) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Postgres.ReadOptions `) Examples -------- @@ -85,9 +85,8 @@ Especially if there are indexes or partitions for columns used in ``where`` clau Options ------- -.. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCReadOptions +.. autopydantic_model:: PostgresReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index 3f762f1ad..bfa90e689 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -43,7 +43,7 @@ Examples WHERE key = 'something' """, - options=Postgres.ReadOptions( + options=Postgres.SQLOptions( partition_column="id", num_partitions=10, lower_bound=0, @@ -66,3 +66,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.postgres.options + +.. autopydantic_model:: PostgresSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index 5424289e4..4214fe63d 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -14,7 +14,7 @@ Reading from Postgres This is how Postgres connector performs this: * For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Postgres type. 
-* Find corresponding ``Postgres type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception. +* Find corresponding ``Postgres type (read)`` → ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. .. [1] @@ -27,8 +27,8 @@ This is how Postgres connector performs this: * Get names of columns in DataFrame. [1]_ * Perform ``SELECT * FROM table LIMIT 0`` query. -* Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. -* Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Take only columns present in DataFrame (by name, case insensitive) [2]_. For each found column get Postgres type. +* Find corresponding ``Spark type`` → ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Postgres type (write)`` match ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is. * If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be casted to target column type **on Postgres side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [3]_. @@ -51,7 +51,7 @@ Create new table using Spark This is how Postgres connector performs this: -* Find corresponding ``Spark type`` -> ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Find corresponding ``Spark type`` → ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
* Generate DDL for creating table in Postgres, like ``CREATE TABLE (col1 ...)``, and run it. * Write DataFrame to created table as is. @@ -248,7 +248,7 @@ String types | ``jsonb`` | | | | +-----------------------------+ | | | | ``xml`` | | | | -+-----------------------------+-----------------------| | | ++-----------------------------+-----------------------+ | | | ``CREATE TYPE ... AS ENUM`` | ``StringType()`` [1]_ | | | +-----------------------------+ | | | | ``tsvector`` | | | | @@ -348,17 +348,15 @@ It is possible to explicitly cast column of unsupported type using ``DBReader(co For example, you can use ``CAST(column AS text)`` to convert data to string representation on Postgres side, and so it will be read as Spark's ``StringType()``. -It is also possible to use `to_json `_ Postgres function -to convert column of any type to string representation, and then parse this column on Spark side using -`from_json `_: +It is also possible to use `to_json `_ Postgres function to convert column of any type to string representation, and then parse this column on Spark side using the :obj:`JSON.parse_column ` method: .. code-block:: python - from pyspark.sql.functions import from_json from pyspark.sql.types import IntegerType from onetl.connection import Postgres from onetl.db import DBReader + from onetl.file.format import JSON postgres = Postgres(...) 
@@ -374,35 +372,37 @@ to convert column of any type to string representation, and then parse this colu ) df = reader.run() - # Spark requires all columns to have some type, describe it - column_type = IntegerType() - - # cast column content to proper Spark type + json_schema = StructType( + [ + StructField("id", IntegerType(), nullable=True), + StructField("name", StringType(), nullable=True), + ..., + ] + ) df = df.select( df.id, df.supported_column, # explicit cast df.unsupported_column_str.cast("integer").alias("parsed_integer"), - # or explicit json parsing - from_json(df.array_column_json, schema).alias("array_column"), + JSON().parse_column("array_column_json", json_schema).alias("json_string"), ) ``DBWriter`` ~~~~~~~~~~~~ -It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Postgres table. - -Using ``to_json`` -^^^^^^^^^^^^^^^^^ +It is always possible to convert data on the Spark side to a string, and then write it to a text column in a Postgres table. -For example, you can convert data using `to_json `_ function. +Using JSON.serialize_column +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +You can use the :obj:`JSON.serialize_column ` method for data serialization: -.. code:: python +.. code-block:: python - from pyspark.sql.functions import to_json + from onetl.file.format import JSON + from pyspark.sql.functions import col from onetl.connection import Postgres - from onetl.db import DBReader + from onetl.db import DBWriter postgres = Postgres(...) @@ -419,7 +419,7 @@ For example, you can convert data using `to_json ` +Method above accepts :obj:`Postgres.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.postgres.options -.. autopydantic_model:: JDBCWriteOptions +.. 
autopydantic_model:: PostgresWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 545b473d8..300a45c30 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -20,7 +20,7 @@ Use ``Teradata.fetch`` Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Teradata config, or reading data from some reference table. Method returns Spark DataFrame. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Teradata.FetchOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -45,7 +45,7 @@ Examples df = teradata.fetch( "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Teradata.JDBCOptions(query_timeout=10), + options=Teradata.FetchOptions(query_timeout=10), ) teradata.close() value = df.collect()[0][0] # get value from first row and first column @@ -55,7 +55,7 @@ Use ``Teradata.execute`` Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. -Method accepts :obj:`JDBCOptions `. +Method accepts :obj:`Teradata.ExecuteOptions `. Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. @@ -64,7 +64,7 @@ Syntax support This method supports **any** query syntax supported by Teradata, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on * ✅︎ ``ALTER ...`` * ✅︎ ``INSERT INTO ... 
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on @@ -95,15 +95,19 @@ Examples ) NO PRIMARY INDEX """, - options=Teradata.JDBCOptions(query_timeout=10), + options=Teradata.ExecuteOptions(query_timeout=10), ) Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCOptions +.. autopydantic_model:: TeradataFetchOptions + :inherited-members: GenericOptions + :member-order: bysource + + +.. autopydantic_model:: TeradataExecuteOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/read.rst b/docs/connection/db_connection/teradata/read.rst index f4cf95bfb..e3b8d4618 100644 --- a/docs/connection/db_connection/teradata/read.rst +++ b/docs/connection/db_connection/teradata/read.rst @@ -18,7 +18,7 @@ Supported DBReader features * * ✅︎ :ref:`incremental-batch-strategy` * ❌ ``hint`` (is not supported by Teradata) * ❌ ``df_schema`` -* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) +* ✅︎ ``options`` (see :obj:`Teradata.ReadOptions `) Examples -------- @@ -115,9 +115,8 @@ Prefer using ``partitioning_mode="hash"`` from example above. Options ------- -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCReadOptions +.. 
autopydantic_model:: TeradataReadOptions + :inherited-members: GenericOptions :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/sql.rst b/docs/connection/db_connection/teradata/sql.rst index 3531d01c3..df50251b0 100644 --- a/docs/connection/db_connection/teradata/sql.rst +++ b/docs/connection/db_connection/teradata/sql.rst @@ -41,7 +41,7 @@ Examples WHERE key = 'something' """, - options=Teradata.ReadOptions( + options=Teradata.SQLOptions( partition_column="part_column", num_partitions=10, lower_bound=0, @@ -64,3 +64,14 @@ Pay attention to ``where`` value Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- + +.. currentmodule:: onetl.connection.db_connection.teradata.options + +.. autopydantic_model:: TeradataSQLOptions + :inherited-members: GenericOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/teradata/write.rst b/docs/connection/db_connection/teradata/write.rst index 288e79667..aec3844f0 100644 --- a/docs/connection/db_connection/teradata/write.rst +++ b/docs/connection/db_connection/teradata/write.rst @@ -110,11 +110,12 @@ Choosing one of the modes can alter connector behavior. For example: Options ------- -Method above accepts :obj:`JDBCWriteOptions ` +Method above accepts :obj:`Teradata.WriteOptions ` -.. currentmodule:: onetl.connection.db_connection.jdbc_connection.options +.. currentmodule:: onetl.connection.db_connection.teradata.options -.. autopydantic_model:: JDBCWriteOptions +.. 
autopydantic_model:: TeradataWriteOptions + :inherited-members: GenericOptions :member-order: bysource :model-show-field-summary: false :field-show-constraints: false diff --git a/docs/connection/file_df_connection/spark_hdfs/index.rst b/docs/connection/file_df_connection/spark_hdfs/index.rst index f7769c0d9..e4f79d6a2 100644 --- a/docs/connection/file_df_connection/spark_hdfs/index.rst +++ b/docs/connection/file_df_connection/spark_hdfs/index.rst @@ -1,12 +1,13 @@ .. _spark-hdfs: Spark HDFS -========================== +========== .. toctree:: :maxdepth: 1 :caption: Connection + prerequisites Connection .. toctree:: diff --git a/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst b/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst new file mode 100644 index 000000000..8cac36de1 --- /dev/null +++ b/docs/connection/file_df_connection/spark_hdfs/prerequisites.rst @@ -0,0 +1,48 @@ +.. _spark-hdfs-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Hadoop versions: 2.x, 3.x (only with Hadoop 3.x libraries) +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +Installing PySpark +------------------ + +To use SparkHDFS connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Using Kerberos +-------------- + +Some of Hadoop managed clusters use Kerberos authentication. In this case, you should call `kinit `_ command +**BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. + +Sometimes it is also required to pass keytab file to Spark config, allowing Spark executors to generate own Kerberos tickets: + +.. tabs:: + + .. 
code-tab:: python Spark 3 + + SparkSession.builder + .config("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.kerberos.principal", "user") + .config("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + + .. code-tab:: python Spark 2 + + SparkSession.builder + .config("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .config("spark.yarn.principal", "user") + .config("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() + +See `Spark security documentation `_ +for more details. diff --git a/docs/connection/file_df_connection/spark_s3/index.rst b/docs/connection/file_df_connection/spark_s3/index.rst index f7a32250d..d086acd8a 100644 --- a/docs/connection/file_df_connection/spark_s3/index.rst +++ b/docs/connection/file_df_connection/spark_s3/index.rst @@ -7,5 +7,6 @@ Spark S3 :maxdepth: 1 :caption: Connection + prerequisites Connection - Troubleshooting Guide + Troubleshooting diff --git a/docs/connection/file_df_connection/spark_s3/prerequisites.rst b/docs/connection/file_df_connection/spark_s3/prerequisites.rst new file mode 100644 index 000000000..0d864e664 --- /dev/null +++ b/docs/connection/file_df_connection/spark_s3/prerequisites.rst @@ -0,0 +1,68 @@ +.. _spark-s3-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) +* Java versions: 8 - 20 + +Installing PySpark +------------------ + +To use SparkS3 connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to S3 +---------------- + +Bucket access style +~~~~~~~~~~~~~~~~~~~ + +AWS and some other S3 cloud providers allow bucket access using domain style only, e.g. ``https://mybucket.s3provider.com``. 
+ +Other implementations, like Minio, by default allow path style access only, e.g. ``https://s3provider.com/mybucket`` +(see `MINIO_DOMAIN `_). + +You should set ``path.style.access`` to ``True`` or ``False``, to choose the preferred style. + +Authentication +~~~~~~~~~~~~~~ + +Different S3 instances can use different authentication methods, like: + * ``access_key + secret_key`` (or username + password) + * ``access_key + secret_key + session_token`` + +Usually these are just passed to SparkS3 constructor: + +.. code:: python + + SparkS3( + access_key=..., + secret_key=..., + session_token=..., + ) + +But some S3 cloud providers, like AWS, may require custom credential providers. You can pass them like: + +.. code:: python + + SparkS3( + extra={ + # provider class + "aws.credentials.provider": "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider", + # other options, if needed + "assumed.role.arn": "arn:aws:iam::90066806600238:role/s3-restricted", + }, + ) + +See `Hadoop-AWS `_ documentation. + +Troubleshooting +--------------- + +See :ref:`spark-s3-troubleshooting`. diff --git a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst index 2f3017547..a08669af9 100644 --- a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst +++ b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst @@ -3,6 +3,10 @@ Spark S3 Troubleshooting ======================== +.. note:: + + General guide: :ref:`troubleshooting`. + More details: * `Hadoop AWS Troubleshooting Guide `_ @@ -34,12 +38,7 @@ How to determine reason Make logging more verbose ^^^^^^^^^^^^^^^^^^^^^^^^^ -Change Spark session log level to ``DEBUG`` to print result of each attempt: - -.. code:: python - - spark.sparkContext.setLogLevel("debug") - +Change Spark session log level to :ref:`DEBUG ` to print result of each attempt. Resulting logs will look like this .. 
dropdown:: See log @@ -171,15 +170,6 @@ Resulting logs will look like this 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 -After getting all information you need, make logs less verbose: - -.. code:: python - - spark.sparkContext.setLogLevel("info") - - # or - spark.sparkContext.setLogLevel("warn") - Change number of retries ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -225,13 +215,13 @@ If you change port number, this does not lead to changing protocol: .. code:: python - spark_s3 = SparkS3(host="s3.domain.com", port=8080, ...) + spark_s3 = SparkS3(host="s3provider.com", port=8080, ...) You should pass protocol explicitly: .. code:: python - spark_s3 = SparkS3(host="s3.domain.com", port=8080, protocol="http", ...) + spark_s3 = SparkS3(host="s3provider.com", port=8080, protocol="http", ...) SSL certificate is self-signed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -261,20 +251,14 @@ Accessing S3 without domain-style access style support .. code:: txt - Caused by: java.net.UnknownHostException: my-bucket.s3.domain.com - -By default, Hadoop AWS uses domain-style access ``my-bucket.domain.com`` instead of path-style access ``domain.com/my-bucket``, -because this is default option for AWS S3. - -But some S3 implementations does not support domain-style access, e.g. MinIO by default allows only path-style access -(see `MINIO_DOMAIN `_). + Caused by: java.net.UnknownHostException: my-bucket.s3provider.com To use path-style access, use option below: .. 
code:: python spark_s3 = SparkS3( - host="s3.domain.com", + host="s3provider.com", bucket="my-bucket", ..., extra={ diff --git a/docs/file/file_downloader/result.rst b/docs/file/file_downloader/result.rst index 8fd20e9df..dec2b2dd4 100644 --- a/docs/file/file_downloader/result.rst +++ b/docs/file/file_downloader/result.rst @@ -6,4 +6,4 @@ File Downloader Result .. currentmodule:: onetl.file.file_downloader.result .. autoclass:: DownloadResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json diff --git a/docs/file/file_mover/result.rst b/docs/file/file_mover/result.rst index d4ea950f3..c77340bc7 100644 --- a/docs/file/file_mover/result.rst +++ b/docs/file/file_mover/result.rst @@ -6,4 +6,4 @@ File Mover Result .. currentmodule:: onetl.file.file_mover.result .. 
autoclass:: MoveResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json diff --git a/docs/file/file_uploader/result.rst b/docs/file/file_uploader/result.rst index af20ace14..f83acf9a5 100644 --- a/docs/file/file_uploader/result.rst +++ b/docs/file/file_uploader/result.rst @@ -6,4 +6,4 @@ File Uploader Result .. currentmodule:: onetl.file.file_uploader.result .. autoclass:: UploadResult - :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, reraise_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json + :members: successful, failed, skipped, missing, successful_count, failed_count, skipped_count, missing_count, total_count, successful_size, failed_size, skipped_size, total_size, raise_if_failed, raise_if_missing, raise_if_skipped, raise_if_empty, is_empty, raise_if_contains_zero_size, details, summary, dict, json diff --git a/docs/file_df/file_formats/avro.rst b/docs/file_df/file_formats/avro.rst index 7f1ec0d4f..1fe9ef781 100644 --- a/docs/file_df/file_formats/avro.rst +++ b/docs/file_df/file_formats/avro.rst @@ -6,4 +6,4 @@ Avro .. currentmodule:: onetl.file.format.avro .. 
autoclass:: Avro - :members: get_packages + :members: get_packages, parse_column, serialize_column diff --git a/docs/file_df/file_formats/csv.rst b/docs/file_df/file_formats/csv.rst index 44201e71a..93119ed25 100644 --- a/docs/file_df/file_formats/csv.rst +++ b/docs/file_df/file_formats/csv.rst @@ -6,4 +6,4 @@ CSV .. currentmodule:: onetl.file.format.csv .. autoclass:: CSV - :members: __init__ + :members: __init__, parse_column, serialize_column diff --git a/docs/file_df/file_formats/json.rst b/docs/file_df/file_formats/json.rst index acbaf460c..a2671aefe 100644 --- a/docs/file_df/file_formats/json.rst +++ b/docs/file_df/file_formats/json.rst @@ -6,4 +6,4 @@ JSON .. currentmodule:: onetl.file.format.json .. autoclass:: JSON - :members: __init__ + :members: __init__, parse_column, serialize_column diff --git a/docs/file_df/file_formats/xml.rst b/docs/file_df/file_formats/xml.rst index 187aa89a4..cfe560ccc 100644 --- a/docs/file_df/file_formats/xml.rst +++ b/docs/file_df/file_formats/xml.rst @@ -6,4 +6,4 @@ XML .. currentmodule:: onetl.file.format.xml .. autoclass:: XML - :members: get_packages + :members: get_packages, parse_column diff --git a/docs/hooks/index.rst b/docs/hooks/index.rst index da56c10f8..261bb4051 100644 --- a/docs/hooks/index.rst +++ b/docs/hooks/index.rst @@ -3,6 +3,8 @@ Hooks ===== +.. versionadded:: 0.6.0 + .. toctree:: :maxdepth: 1 :caption: Hooks diff --git a/docs/index.rst b/docs/index.rst index 71d3fc250..479a7c72c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,8 @@ install/index quickstart concepts + logging + troubleshooting/index .. toctree:: :maxdepth: 3 @@ -62,13 +64,6 @@ hooks/index plugins -.. toctree:: - :maxdepth: 2 - :caption: Misc - :hidden: - - logging - .. toctree:: :maxdepth: 2 :caption: Development diff --git a/docs/logging.rst b/docs/logging.rst index 6fc4eda41..1198a64d1 100644 --- a/docs/logging.rst +++ b/docs/logging.rst @@ -1,7 +1,157 @@ .. 
_logging: Logging -========= +======= + +Logging is quite important to understand what's going on under the hood of onETL. + +Default logging level for Python interpreters is ``WARNING``, +but most of onETL logs are in ``INFO`` level, so users usually don't see much. + +To change logging level, there is a function :obj:`setup_logging ` +which should be called at the top of the script: + +.. code:: python + + from onetl.log import setup_logging + from other.lib import some, more, imports + + setup_logging() + + # rest of code + ... + +This changes both log level and log formatting to something like this: + +.. dropdown:: See logs + + .. code:: text + + 2024-04-12 10:12:10,834 [INFO ] MainThread: |onETL| Using IncrementalStrategy as a strategy + 2024-04-12 10:12:10,835 [INFO ] MainThread: =================================== DBReader.run() starts =================================== + 2024-04-12 10:12:10,835 [INFO ] MainThread: |DBReader| Getting Spark type for HWM expression: 'updated_at' + 2024-04-12 10:12:10,836 [INFO ] MainThread: |MSSQL| Fetching schema of table 'source_schema.table' ... + 2024-04-12 10:12:11,636 [INFO ] MainThread: |MSSQL| Schema fetched.
+ 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Got Spark field: StructField('updated_at', TimestampType(), True) + 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Detected HWM type: 'ColumnDateTimeHWM' + 2024-04-12 10:12:11,643 [INFO ] MainThread: |IncrementalStrategy| Fetching HWM from HorizonHWMStore: + 2024-04-12 10:12:11,643 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb' + 2024-04-12 10:12:12,181 [INFO ] MainThread: |IncrementalStrategy| Fetched HWM: + 2024-04-12 10:12:12,182 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 10:12:12,182 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,182 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,182 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,184 [INFO ] MainThread: value = datetime.datetime(2024, 4, 11, 18, 10, 2, 120000), + 2024-04-12 10:12:12,184 [INFO ] MainThread: ) + 2024-04-12 10:12:12,184 [INFO ] MainThread: |MSSQL| -> |Spark| Reading DataFrame from source using parameters: + 2024-04-12 10:12:12,185 [INFO ] MainThread: source = 'source_schema.table' + 2024-04-12 10:12:12,185 [INFO ] MainThread: columns = [ + 2024-04-12 10:12:12,185 [INFO ] MainThread: 'id', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'new_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'old_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'updated_at', + 2024-04-12 10:12:12,186 [INFO ] MainThread: ] + 2024-04-12 10:12:12,187 [INFO ] MainThread: where = "field = 'some'" + 2024-04-12 10:12:12,187 [INFO ] MainThread: hwm = AutoDetectHWM( + 2024-04-12 10:12:12,187 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,187 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,187 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,188 [INFO ] MainThread: ) + 
2024-04-12 10:12:12,188 [INFO ] MainThread: options = { + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'fetchsize': 100000, + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'numPartitions': 1, + 2024-04-12 10:12:12,189 [INFO ] MainThread: 'partitioningMode': 'range', + 2024-04-12 10:12:12,189 [INFO ] MainThread: } + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Checking connection availability... + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Using connection parameters: + 2024-04-12 10:12:12,190 [INFO ] MainThread: user = 'db_user' + 2024-04-12 10:12:12,190 [INFO ] MainThread: password = SecretStr('**********') + 2024-04-12 10:12:12,190 [INFO ] MainThread: host = 'mssql.host' + 2024-04-12 10:12:12,190 [INFO ] MainThread: port = 1433 + 2024-04-12 10:12:12,191 [INFO ] MainThread: database = 'somedb' + 2024-04-12 10:12:12,191 [INFO ] MainThread: extra = {'ApplicationIntent': 'ReadOnly', 'trustServerCertificate': 'true'} + 2024-04-12 10:12:12,191 [INFO ] MainThread: jdbc_url = 'jdbc:sqlserver:/mssql.host:1433' + 2024-04-12 10:12:12,579 [INFO ] MainThread: |MSSQL| Connection is available. 
+ 2024-04-12 10:12:12,581 [INFO ] MainThread: |MSSQL| Executing SQL query (on driver): + 2024-04-12 10:12:12,581 [INFO ] MainThread: SELECT + 2024-04-12 10:12:12,581 [INFO ] MainThread: MIN(updated_at) AS "min", + 2024-04-12 10:12:12,582 [INFO ] MainThread: MAX(updated_at) AS "max" + 2024-04-12 10:12:12,582 [INFO ] MainThread: FROM + 2024-04-12 10:12:12,582 [INFO ] MainThread: source_schema.table + 2024-04-12 10:12:12,582 [INFO ] MainThread: WHERE + 2024-04-12 10:12:12,582 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:12:12,583 [INFO ] MainThread: AND + 2024-04-12 10:12:12,583 [INFO ] MainThread: (updated_at >= CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,537 [INFO ] MainThread: |MSSQL| Received values: + 2024-04-12 10:16:22,538 [INFO ] MainThread: MIN(updated_at) = datetime.datetime(2024, 4, 11, 21, 10, 7, 397000) + 2024-04-12 10:16:22,538 [INFO ] MainThread: MAX(updated_at) = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000) + 2024-04-12 10:16:22,540 [INFO ] MainThread: |MSSQL| Executing SQL query (on executor): + 2024-04-12 10:16:22,540 [INFO ] MainThread: SELECT + 2024-04-12 10:16:22,540 [INFO ] MainThread: id, + 2024-04-12 10:16:22,541 [INFO ] MainThread: new_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: old_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: updated_at + 2024-04-12 10:16:22,541 [INFO ] MainThread: FROM + 2024-04-12 10:16:22,541 [INFO ] MainThread: source_schema.table + 2024-04-12 10:16:22,542 [INFO ] MainThread: WHERE + 2024-04-12 10:16:22,542 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at > CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at <= CAST('2024-04-12T13:12:02.123000' AS datetime2)) + 2024-04-12 10:16:22,892 [INFO ] MainThread: |Spark| DataFrame successfully created from SQL statement + 
2024-04-12 10:16:22,892 [INFO ] MainThread: ------------------------------------ DBReader.run() ends ------------------------------------ + 2024-04-12 10:40:42,409 [INFO ] MainThread: =================================== DBWriter.run() starts =================================== + 2024-04-12 10:40:42,409 [INFO ] MainThread: |Spark| -> |Hive| Writing DataFrame to target using parameters: + 2024-04-12 10:40:42,410 [INFO ] MainThread: target = 'target_source_schema.table' + 2024-04-12 10:40:42,410 [INFO ] MainThread: options = { + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'mode': 'append', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'format': 'orc', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'partitionBy': 'part_dt', + 2024-04-12 10:40:42,410 [INFO ] MainThread: } + 2024-04-12 10:40:42,411 [INFO ] MainThread: df_schema: + 2024-04-12 10:40:42,412 [INFO ] MainThread: root + 2024-04-12 10:40:42,412 [INFO ] MainThread: |-- id: integer (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- new_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- old_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- updated_at: timestamp (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- part_dt: date (nullable = true) + 2024-04-12 10:40:42,414 [INFO ] MainThread: + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Checking connection availability... + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Using connection parameters: + 2024-04-12 10:40:42,421 [INFO ] MainThread: cluster = 'dwh' + 2024-04-12 10:40:42,475 [INFO ] MainThread: |Hive| Connection is available. + 2024-04-12 10:40:42,476 [INFO ] MainThread: |Hive| Fetching schema of table 'target_source_schema.table' ... + 2024-04-12 10:40:43,518 [INFO ] MainThread: |Hive| Schema fetched. 
+ 2024-04-12 10:40:43,521 [INFO ] MainThread: |Hive| Table 'target_source_schema.table' already exists + 2024-04-12 10:40:43,521 [WARNING ] MainThread: |Hive| User-specified options {'partitionBy': 'part_dt'} are ignored while inserting into existing table. Using only table parameters from Hive metastore + 2024-04-12 10:40:43,782 [INFO ] MainThread: |Hive| Inserting data into existing table 'target_source_schema.table' ... + 2024-04-12 11:06:07,396 [INFO ] MainThread: |Hive| Data is successfully inserted into table 'target_source_schema.table'. + 2024-04-12 11:06:07,397 [INFO ] MainThread: ------------------------------------ DBWriter.run() ends ------------------------------------ + 2024-04-12 11:06:07,397 [INFO ] MainThread: |onETL| Exiting IncrementalStrategy + 2024-04-12 11:06:07,397 [INFO ] MainThread: |IncrementalStrategy| Saving HWM to 'HorizonHWMStore': + 2024-04-12 11:06:07,397 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 11:06:07,397 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 11:06:07,397 [INFO ] MainThread: entity = 'source_source_schema.table', + 2024-04-12 11:06:07,397 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 11:06:07,397 [INFO ] MainThread: value = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000), + 2024-04-12 11:06:07,397 [INFO ] MainThread: ) + 2024-04-12 11:06:07,495 [INFO ] MainThread: |IncrementalStrategy| HWM has been saved + +Each step performed by onETL is extensively logged, which should help with debugging. + +You can make logs even more verbose by changing level to ``DEBUG``: + +.. code:: python + + from onetl.log import setup_logging + + setup_logging(level="DEBUG", enable_clients=True) + + # rest of code + ... + +This also changes log level for all underlying Python libraries, e.g. showing each HTTP request being made, and so on. .. 
currentmodule:: onetl.log diff --git a/docs/plugins.rst b/docs/plugins.rst index 4c10bc762..7a429d484 100644 --- a/docs/plugins.rst +++ b/docs/plugins.rst @@ -3,6 +3,8 @@ Plugins ======= +.. versionadded:: 0.6.0 + What are plugins? ----------------- @@ -82,6 +84,8 @@ like :ref:`hook-decorator`, it will be executed during this import. How to enable/disable plugins? ------------------------------ +.. versionadded:: 0.7.0 + Disable/enable all plugins ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/troubleshooting/index.rst b/docs/troubleshooting/index.rst new file mode 100644 index 000000000..536170561 --- /dev/null +++ b/docs/troubleshooting/index.rst @@ -0,0 +1,25 @@ +.. _troubleshooting: + +Troubleshooting +=============== + +In case of error please follow instructions below: + +* Read the logs or exception messages you've encountered. + * If Python logs are not verbose enough, :ref:`increase the log level `. + * If Spark logs are not verbose enough, :ref:`increase the log level `. +* Read documentation related to a class or method you are using. +* `Google `_ the error message, and carefully read the search result: + * `StackOverflow `_ answers. + * `Spark `_ documentation. + * Documentation of database or filesystem you are connecting to. + * Documentation of underlying connector. +* Search for known `issues `_, or create a new one. +* Always use the most recent versions of onETL, PySpark and connector packages, :ref:`compatible with your environment `. + +.. toctree:: + :maxdepth: 3 + :caption: Troubleshooting + :hidden: + + spark diff --git a/docs/troubleshooting/spark.rst b/docs/troubleshooting/spark.rst new file mode 100644 index 000000000..0c1a20eb9 --- /dev/null +++ b/docs/troubleshooting/spark.rst @@ -0,0 +1,79 @@ +.. _troubleshooting-spark: + +Spark Troubleshooting +===================== + +Restarting Spark session +------------------------ + +Sometimes it is required to stop current Spark session and start a new one, e.g.
to add some .jar packages, or change session config. +But PySpark not only starts Spark session, but also starts Java virtual machine (JVM) process in the background, +which outlives the session. So calling ``sparkSession.stop()`` `does not shut down JVM `_, +and this can cause some issues. + +Also apart from JVM properties, stopping Spark session does not clear Spark context, which is a global object. So new +Spark sessions are created using the same context object, and thus using the same Spark config options. + +To properly stop Spark session, it is **required** to: +* Stop Spark session by calling ``sparkSession.stop()``. +* **STOP PYTHON INTERPRETER**, e.g. by calling ``sys.exit()``. +* Start new Python interpreter. +* Start new Spark session with config options you need. + +Skipping some of these steps can lead to issues with creating new Spark session. + +Driver log level +---------------- + +Default logging level for Spark session is ``WARN``. To show more verbose logs, use: + +.. code:: python + + spark.sparkContext.setLogLevel("INFO") + +or increase verbosity even more: + +.. code:: python + + spark.sparkContext.setLogLevel("DEBUG") + +After getting all information you need, you can restore the previous log level: + +.. code:: python + + spark.sparkContext.setLogLevel("WARN") + +Executors log level +------------------- + +``sparkContext.setLogLevel`` changes only log level of Spark session on Spark **driver**. +To make Spark executor logs more verbose, perform following steps: + +* Create ``log4j.properties`` file with content like this: + + .. code-block:: jproperties + + log4j.rootCategory=DEBUG, console + + log4j.appender.console=org.apache.log4j.ConsoleAppender + log4j.appender.console.target=System.err + log4j.appender.console.layout=org.apache.log4j.PatternLayout + log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +* Stop existing Spark session and create a new one with following options: + + ..
code-block:: python + + from pyspark.sql import SparkSession + + spark = ( + SparkSession.builder.config("spark.files", "file:log4j.properties").config( + "spark.executor.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties" + ) + # you can apply the same logging settings to Spark driver, by uncommenting the line below + # .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties") + .getOrCreate() + ) + +Each Spark executor will receive a copy of ``log4j.properties`` file during start, and load it to change own log level. +Same approach can be used for Spark driver as well, to investigate issue when Spark session cannot properly start. diff --git a/onetl/VERSION b/onetl/VERSION index 5eef0f10e..d9df1bbc0 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.2 +0.11.0 diff --git a/onetl/_internal.py b/onetl/_internal.py index 298ecdf95..361bb3e82 100644 --- a/onetl/_internal.py +++ b/onetl/_internal.py @@ -10,8 +10,6 @@ from datetime import datetime from typing import TYPE_CHECKING, Any -from etl_entities.process import ProcessStackManager - try: from pydantic.v1 import SecretStr except (ImportError, AttributeError): @@ -33,19 +31,18 @@ def clear_statement(statement: str) -> str: Examples -------- - .. code:: python - - assert clear_statement("SELECT * FROM mytable") == "SELECT * FROM mytable" - assert clear_statement("SELECT * FROM mytable ; ") == "SELECT * FROM mytable" - assert ( - clear_statement("CREATE TABLE mytable (id NUMBER)") - == "CREATE TABLE mytable (id NUMBER)" - ) - assert clear_statement("BEGIN ... END") == "BEGIN ... END;" + >>> clear_statement("SELECT * FROM mytable") + 'SELECT * FROM mytable' + >>> clear_statement("SELECT * FROM mytable ; ") + 'SELECT * FROM mytable' + >>> clear_statement("CREATE TABLE mytable (id NUMBER)") + 'CREATE TABLE mytable (id NUMBER)' + >>> clear_statement("BEGIN ... END") + 'BEGIN ...
END;' """ - statement = statement.rstrip().lstrip("\n\r").rstrip(";") - if statement.lower().strip().endswith("end"): + statement = statement.rstrip().lstrip("\n\r").rstrip(";").rstrip() + if statement.lower().endswith("end"): statement += ";" return statement @@ -57,11 +54,12 @@ def uniq_ignore_case(orig_list: list[str]) -> list[str]: Examples -------- - .. code:: python - - assert uniq_ignore_case(["a", "c"]) == ["a", "c"] - assert uniq_ignore_case(["A", "a", "c"]) == ["A", "c"] - assert uniq_ignore_case(["a", "A", "c"]) == ["a", "c"] + >>> uniq_ignore_case(["a", "c"]) + ['a', 'c'] + >>> uniq_ignore_case(["A", "a", "c"]) + ['A', 'c'] + >>> uniq_ignore_case(["a", "A", "c"]) + ['a', 'c'] """ result: list[str] = [] @@ -90,14 +88,22 @@ def stringify(value: Any, quote: bool = False) -> Any: # noqa: WPS212 Examples -------- - >>> assert stringify(1) == "1" - >>> assert stringify(True) == "true" - >>> assert stringify(False) == "false" - >>> assert stringify(None) == "null" - >>> assert stringify("string") == "string" - >>> assert stringify("string", quote=True) == '"string"' - >>> assert stringify({"abc": 1}) == {"abc": "1"} - >>> assert stringify([1, True, False, None, "string"]) == ["1", "true", "false", "null", "string"] + >>> stringify(1) + '1' + >>> stringify(True) + 'true' + >>> stringify(False) + 'false' + >>> stringify(None) + 'null' + >>> stringify("string") + 'string' + >>> stringify("string", quote=True) + '"string"' + >>> stringify({"abc": 1}) + {'abc': '1'} + >>> stringify([1, True, False, None, "string"]) + ['1', 'true', 'false', 'null', 'string'] """ if isinstance(value, dict): @@ -131,9 +137,8 @@ def to_camel(string: str) -> str: Examples -------- - .. 
code:: python - - assert to_camel("some_value") == "someValue" + >>> to_camel("some_value") + 'someValue' """ return "".join(word.capitalize() if index > 0 else word for index, word in enumerate(string.split("_"))) @@ -151,24 +156,17 @@ def generate_temp_path(root: PurePath) -> PurePath: Examples -------- - View files - - .. code:: python - - from etl_entities.process import Process - - from pathlib import Path - - assert generate_temp_path(Path("/tmp")) == Path( - "/tmp/onetl/currenthost/myprocess/20230524122150", - ) - - with Process(dag="mydag", task="mytask"): - assert generate_temp_path(Path("/abc")) == Path( - "/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150", - ) + >>> from etl_entities.process import Process + >>> from pathlib import Path + >>> generate_temp_path(Path("/tmp")) # doctest: +SKIP + Path("/tmp/onetl/currenthost/myprocess/20230524122150") + >>> with Process(dag="mydag", task="mytask"): # doctest: +SKIP + ... generate_temp_path(Path("/abc")) + Path("/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150") """ + from etl_entities.process import ProcessStackManager + current_process = ProcessStackManager.get_current() current_dt = datetime.now().strftime(DATETIME_FORMAT) return root / "onetl" / current_process.host / current_process.full_name / current_dt diff --git a/onetl/_util/classproperty.py b/onetl/_util/classproperty.py index 8738304dc..e971638ab 100644 --- a/onetl/_util/classproperty.py +++ b/onetl/_util/classproperty.py @@ -15,7 +15,8 @@ class classproperty(property): # noqa: N801 ... def attribute(cls): ... 
return 123 >>> # no call - >>> assert My.attribute == 123 + >>> My.attribute + 123 """ def __init__(self, f): diff --git a/onetl/_util/hadoop.py b/onetl/_util/hadoop.py index 1faee4188..fdf275de7 100644 --- a/onetl/_util/hadoop.py +++ b/onetl/_util/hadoop.py @@ -17,7 +17,7 @@ def get_hadoop_version(spark_session: SparkSession) -> Version: jvm = spark_session._jvm # noqa: WPS437 version_info = jvm.org.apache.hadoop.util.VersionInfo # type: ignore[union-attr] hadoop_version: str = version_info.getVersion() - return Version.parse(hadoop_version) + return Version(hadoop_version) def get_hadoop_config(spark_session: SparkSession): diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index ec5d53fd8..397a91576 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -10,5 +10,5 @@ def get_default_scala_version(spark_version: Version) -> Version: Get default Scala version for specific Spark version """ if spark_version.major < 3: - return Version(2, 11) - return Version(2, 12) + return Version("2.11") + return Version("2.12") diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index 218c8d7de..230abe80e 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -63,14 +63,14 @@ def get_pyspark_version() -> Version: try_import_pyspark() import pyspark - return Version.parse(pyspark.__version__) + return Version(pyspark.__version__) def get_spark_version(spark_session: SparkSession) -> Version: """ Get Spark version from active Spark session """ - return Version.parse(spark_session.version) + return Version(spark_session.version) def get_executor_total_cores(spark_session: SparkSession, include_driver: bool = False) -> tuple[int | float, dict]: diff --git a/onetl/_util/version.py b/onetl/_util/version.py index 87583ee63..85bde1c7a 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -2,193 +2,226 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import re from functools import total_ordering -from string import 
ascii_lowercase -from typing import Iterator, NamedTuple, Sequence @total_ordering -class Version(NamedTuple): +class Version: """ Version representation. Examples -------- - >>> Version(5, 6, 7) - Version(major=5, minor=6, patch=7) - >>> Version(5, 6) - Version(major=5, minor=6, patch=None) - >>> Version(5) - Version(major=5, minor=None, patch=None) + >>> Version("12.3.1") + Version('12.3.1') + >>> Version("12.3") + Version('12.3') + >>> Version("12.3.4.5") + Version('12.3.4.5') + >>> Version("12.3.4-patch5") + Version('12.3.4-patch5') """ - major: int - minor: int | None = None - patch: int | None = None + def __init__(self, version: str): + self._raw_str = version + self._raw_parts = re.split("[.-]", version) + self._numeric_parts = [int(part) for part in self._raw_parts if part.isdigit()] - def __iter__(self) -> Iterator[int]: + @property + def major(self) -> int: """ - Iterate over version components which are not ``None``. + Return the major version component. Examples -------- - >>> for part in Version(5, 6, 7): - ... print(part) + >>> Version("5.6.7").major 5 - 6 - 7 + """ + return self._numeric_parts[0] if self._numeric_parts else 0 - >>> for part in Version(5, 6): - ... print(part) - 5 - 6 + @property + def minor(self) -> int: + """ + Return the minor version component. - >>> for part in Version(5): - ... print(part) - 5 + Examples + -------- + >>> Version("5.6.7").minor + 6 + >>> Version("5").minor + 0 """ - yield self.major - if self.minor is not None: - yield self.minor - if self.patch is not None: - yield self.patch + return self._numeric_parts[1] if len(self._numeric_parts) > 1 else 0 - def __len__(self): + @property + def patch(self) -> int: """ - Get number of components set. + Return the patch version component. 
Examples -------- - >>> assert len(Version(5, 6, 7)) == 3 - >>> assert len(Version(5, 6)) == 2 - >>> assert len(Version(5)) == 1 - + >>> Version("5.6.7").patch + 7 + >>> Version("5.6").patch + 0 """ - if self.patch is not None: - return 3 - if self.minor is not None: - return 2 - return 1 + return self._numeric_parts[2] if len(self._numeric_parts) > 2 else 0 - def __str__(self): + @property + def raw_parts(self) -> list[str]: """ - Get version as string. + Returns the parts of the version string as a list of substrings split by '.' or '-'. Examples -------- - - >>> assert str(Version(5, 6, 7)) == "5.6.7" - >>> assert str(Version(5, 6)) == "5.6" - >>> assert str(Version(5)) == "5" + >>> Version("1.2.3-alpha").raw_parts + ['1', '2', '3', 'alpha'] """ - return ".".join(map(str, self)) + return self._raw_parts - def __eq__(self, other): + def __getitem__(self, item): """ - Compare versions. + Allows direct access to the numeric parts of the version by index. Examples -------- + >>> Version("1.2.3")[0] + 1 + >>> Version("1.2.3")[1] + 2 + >>> Version("1.2.3")[2] + 3 + >>> Version("1.2.3-alpha")[3] + Traceback (most recent call last): + ... + IndexError: list index out of range + """ + return self._numeric_parts[item] + + def __len__(self): + """ + Get number of components set. - >>> assert Version(5, 6, 7) == Version(5, 6, 7) + Examples + -------- - >>> # Version could be replaced with tuple[int, ...] - >>> assert Version(5, 6, 7) == (5, 6, 7) - >>> assert Version(5, 6) == (5, 6) - >>> assert Version(5) == (5,) + >>> len(Version("5.6.7")) + 3 + >>> len(Version("5.6")) + 2 + >>> len(Version("5")) + 1 """ - if not isinstance(other, tuple): - return NotImplemented + return len(self._numeric_parts) - return tuple(self) == other + def __repr__(self): + return f"Version('{self._raw_str}')" - def __gt__(self, other): + def __str__(self): """ - Compare versions. + Return a string representation of the version. 
Examples -------- - >>> assert Version(5, 6, 7) > Version(5, 6, 6) - >>> assert not Version(5, 6, 7) > Version(5, 6, 7) - - >>> # Version could be replaced with tuple[int, ...] - >>> assert Version(5, 6, 7) > (5, 6) - >>> assert not Version(5, 6, 7) > (5, 7) + >>> str(Version("5.6.7")) + '5.6.7' + >>> str(Version("5.6")) + '5.6' + >>> str(Version("5.6.7.8")) + '5.6.7.8' + >>> str(Version("5.6.7-patch8")) + '5.6.7-patch8' - >>> assert Version(5, 6) > (5, 5) - >>> assert not Version(5, 6) > (5, 6) - >>> assert not Version(5, 6) > (5, 7) + """ + return self._raw_str - >>> assert Version(5, 6) > (5,) - >>> assert not Version(5, 6) > (6,) + def __eq__(self, other): + """ + Compare two versions for equality. - >>> assert Version(5) > (4,) - >>> assert not Version(5) > (5,) - >>> assert not Version(5) > (6,) + Examples + -------- + >>> Version("5.6.7") == Version("5.6.7") + True + >>> Version("5.6.7") == Version("5.6.8") + False """ - if not isinstance(other, tuple): + if not isinstance(other, Version): return NotImplemented + return self._numeric_parts == other._numeric_parts - return tuple(self) > other - - @classmethod - def parse(cls, version: int | float | str | Sequence) -> Version: + def __lt__(self, other: Version): """ - Parse input as version object. + Compare two versions using less than. 
Examples -------- - >>> assert Version.parse("5.6.7") == Version(5, 6, 7) - >>> assert Version.parse("5.6") == Version(5, 6) - >>> assert Version.parse("5") == Version(5) - - >>> assert Version.parse([5, 6, 7]) == Version(5, 6, 7) - >>> assert Version.parse([5, 6]) == Version(5, 6) - >>> assert Version.parse([5]) == Version(5) - - >>> assert Version.parse(5) == Version(5) - >>> assert Version.parse(5.0) == Version(5, 0) - + >>> Version("5.6.7") < Version("5.6.8") + True + >>> Version("5.6.9") < Version("5.6.8") + False """ - if isinstance(version, (int, float)): - version = str(version) - if isinstance(version, str): - version = version.split(".") - return cls(*map(int, version[:3])) + if not isinstance(other, Version): + return NotImplemented + return self._numeric_parts < other._numeric_parts - def digits(self, items: int) -> Version: + def min_digits(self, num_parts: int) -> Version: """ - Return version with exactly N components. + Ensure the version has at least a specified number of numeric components. Raises ------ - AssertionError + ValueError There is not enough components Examples -------- - - >>> assert Version(5, 6, 7).digits(3) == Version(5, 6, 7) - >>> assert Version(5, 6, 7).digits(2) == Version(5, 6) - >>> assert Version(5, 6, 7).digits(1) == Version(5) - >>> Version(5, 6).digits(3) - Traceback (most recent call last): - AssertionError: Version '5.6' does not match format 'a.b.c' - >>> Version(5).digits(2) + >>> Version("5.6.7").min_digits(3) + Version('5.6.7') + >>> Version("5.6.7").min_digits(2) + Version('5.6.7') + >>> Version("5.6").min_digits(3) Traceback (most recent call last): - AssertionError: Version '5' does not match format 'a.b' + ... + ValueError: Version '5.6' does not have enough numeric components for requested format (expected at least 3). 
+ """ + if len(self._numeric_parts) < num_parts: + raise ValueError( + f"Version '{self}' does not have enough numeric components for requested format (expected at least {num_parts}).", + ) + return self + + def format(self, format_string: str) -> str: + """ + Format the version using a custom format string. + + Examples + -------- + >>> v = Version("5.6.7") + >>> v.format("{major}.{minor}.{patch}") + '5.6.7' + >>> v.format("{0}.{1}.{2}") + '5.6.7' + >>> v.format("{0}.{1}.{2} - Complete Version") + '5.6.7 - Complete Version' + >>> v = Version("12.3.4-patch5") + >>> v.format("{major}.{minor}.{patch}") + '12.3.4' """ - if len(self) < items: - expected = ".".join(ascii_lowercase[:items]) - raise AssertionError(f"Version '{self}' does not match format '{expected}'") - return self.parse(tuple(self)[:items]) + return format_string.format( + *self._numeric_parts, + major=self.major, + minor=self.minor, + patch=self.patch, + ) diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index 670b15ec5..f9c7bcac0 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -132,6 +132,9 @@ def read_source_as_df( ) -> DataFrame: """ Reads the source to dataframe. |support_hooks| + + .. versionchanged:: 0.9.0 + Renamed ``read_df`` → ``read_source_as_df`` """ @abstractmethod @@ -142,4 +145,7 @@ def write_df_to_target( ) -> None: """ Saves dataframe to a specific target. |support_hooks| + + .. 
versionchanged:: 0.9.0 + Renamed ``write_df`` → ``write_df_to_target`` """ diff --git a/onetl/base/base_file_connection.py b/onetl/base/base_file_connection.py index 3d1aa2357..81d57bfb6 100644 --- a/onetl/base/base_file_connection.py +++ b/onetl/base/base_file_connection.py @@ -15,13 +15,17 @@ class BaseFileConnection(BaseConnection): """ - Implements generic methods for files and directories manipulation on some filesystem (usually remote) + Implements generic methods for files and directories manipulation on some filesystem (usually remote). + + .. versionadded:: 0.8.0 """ @abstractmethod def path_exists(self, path: os.PathLike | str) -> bool: """ - Check if specified path exists on remote filesystem. |support_hooks| + Check if specified path exists on remote filesystem. |support_hooks|. + + .. versionadded:: 0.8.0 Parameters ---------- @@ -35,11 +39,12 @@ def path_exists(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.path_exists("/path/to/file.csv") - assert connection.path_exists("/path/to/dir") - assert not connection.path_exists("/path/to/missing") + >>> connection.path_exists("/path/to/file.csv") + True + >>> connection.path_exists("/path/to/dir") + True + >>> connection.path_exists("/path/to/missing") + False """ @abstractmethod @@ -47,6 +52,8 @@ def is_file(self, path: os.PathLike | str) -> bool: """ Check if specified path is a file. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -64,10 +71,10 @@ def is_file(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.is_file("/path/to/dir/file.csv") - assert not connection.is_file("/path/to/dir") + >>> connection.is_file("/path/to/dir/file.csv") + True + >>> connection.is_file("/path/to/dir") + False """ @abstractmethod @@ -75,6 +82,8 @@ def is_dir(self, path: os.PathLike | str) -> bool: """ Check if specified path is a directory. |support_hooks| + .. 
versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -92,10 +101,10 @@ def is_dir(self, path: os.PathLike | str) -> bool: Examples -------- - .. code:: python - - assert connection.is_dir("/path/to/dir") - assert not connection.is_dir("/path/to/dir/file.csv") + >>> connection.is_dir("/path/to/dir") + True + >>> connection.is_dir("/path/to/dir/file.csv") + False """ @abstractmethod @@ -103,6 +112,8 @@ def get_stat(self, path: os.PathLike | str) -> PathStatProtocol: """ Returns stats for a specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -119,11 +130,11 @@ def get_stat(self, path: os.PathLike | str) -> PathStatProtocol: Examples -------- - .. code:: python - - stat = connection.get_stat("/path/to/file.csv") - assert stat.st_size > 0 - assert stat.st_uid == 12345 # owner id + >>> stat = connection.get_stat("/path/to/file.csv") + >>> stat.st_size # in bytes + 1024 + >>> stat.st_uid # owner id or name + 12345 """ @abstractmethod @@ -131,6 +142,8 @@ def resolve_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Returns directory at specific path, with stats. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -151,11 +164,11 @@ def resolve_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. code:: python - - dir_path = connection.resolve_dir("/path/to/dir") - assert os.fspath(dir_path) == "/path/to/dir" - assert dir_path.stat.st_uid == 12345 # owner id + >>> dir_path = connection.resolve_dir("/path/to/dir") + >>> os.fspath(dir_path) + '/path/to/dir' + >>> dir_path.stat.st_uid # owner id + 12345 """ @abstractmethod @@ -163,6 +176,8 @@ def resolve_file(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Returns file at specific path, with stats. |support_hooks| + .. 
versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -183,11 +198,11 @@ def resolve_file(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. code:: python - - file_path = connection.resolve_file("/path/to/dir/file.csv") - assert os.fspath(file_path) == "/path/to/dir/file.csv" - assert file_path.stat.st_uid == 12345 # owner id + >>> file_path = connection.resolve_file("/path/to/dir/file.csv") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_uid # owner id + 12345 """ @abstractmethod @@ -195,6 +210,8 @@ def create_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: """ Creates directory tree on remote filesystem. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -212,9 +229,9 @@ def create_dir(self, path: os.PathLike | str) -> PathWithStatsProtocol: Examples -------- - .. code:: python - - dir_path = connection.create_dir("/path/to/dir") + >>> dir_path = connection.create_dir("/path/to/dir") + >>> os.fspath(dir_path) + '/path/to/dir' """ @abstractmethod @@ -228,6 +245,8 @@ def remove_file(self, path: os.PathLike | str) -> bool: Supports only one file removal per call. Directory removal is **NOT** supported, use :obj:`~remove_dir` instead. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -245,12 +264,12 @@ def remove_file(self, path: os.PathLike | str) -> bool: Examples -------- - .. 
code:: python - - assert connection.remove_file("/path/to/file.csv") - assert not connection.path_exists("/path/to/dir/file.csv") - - assert not connection.remove_file("/path/to/file.csv") # already deleted + >>> connection.remove_file("/path/to/file.csv") + True + >>> connection.path_exists("/path/to/dir/file.csv") + False + >>> connection.remove_file("/path/to/file.csv") # already deleted, no error + False """ @abstractmethod @@ -260,6 +279,8 @@ def remove_dir(self, path: os.PathLike | str, recursive: bool = False) -> bool: If directory does not exist, no exception is raised. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -280,13 +301,14 @@ def remove_dir(self, path: os.PathLike | str, recursive: bool = False) -> bool: Examples -------- - .. code:: python - - assert connection.remove_dir("/path/to/dir") - assert not connection.path_exists("/path/to/dir/file.csv") - assert not connection.path_exists("/path/to/dir") - - assert not connection.remove_dir("/path/to/dir") # already deleted + >>> connection.remove_dir("/path/to/dir") + True + >>> connection.path_exists("/path/to/dir") + False + >>> connection.path_exists("/path/to/dir/file.csv") + False + >>> connection.remove_dir("/path/to/dir") # already deleted, no error + False """ @abstractmethod @@ -303,6 +325,8 @@ def rename_file( Supports only one file move per call. Directory move/rename is **NOT** supported. + .. versionadded:: 0.8.0 + Parameters ---------- source_file_path : str or :obj:`os.PathLike` @@ -332,11 +356,13 @@ def rename_file( Examples -------- - .. 
code:: python - - new_file = connection.rename_file("/path/to/file1.csv", "/path/to/file2.csv") - assert connection.path_exists("/path/to/file2.csv") - assert not connection.path_exists("/path/to/file1.csv") + >>> new_file = connection.rename_file("/path/to/file1.csv", "/path/to/file2.csv") + >>> os.fspath(new_file) + '/path/to/file2.csv' + >>> connection.path_exists("/path/to/file2.csv") + True + >>> connection.path_exists("/path/to/file1.csv") + False """ @abstractmethod @@ -349,6 +375,8 @@ def list_dir( """ Return list of child files/directories in a specific directory. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -376,11 +404,11 @@ def list_dir( Examples -------- - .. code:: python - - dir_content = connection.list_dir("/path/to/dir") - assert os.fspath(dir_content[0]) == "/path/to/dir/file.csv" - assert connection.path_exists("/path/to/dir/file.csv") + >>> dir_content = connection.list_dir("/path/to/dir") + >>> os.fspath(dir_content[0]) + '/path/to/dir/file.csv' + >>> connection.path_exists("/path/to/dir/file.csv") + True """ @abstractmethod @@ -396,6 +424,8 @@ def walk( Just like :obj:`os.walk`, but with additional filter/limit logic. + .. versionadded:: 0.8.0 + Parameters ---------- root : str or :obj:`os.PathLike` @@ -428,13 +458,16 @@ def walk( Examples -------- - .. code:: python - - for root, dirs, files in connection.walk("/path/to/dir"): - assert os.fspath(root) == "/path/to/dir" - assert dirs == [] - assert os.fspath(files[0]) == "/path/to/dir/file.csv" - assert connection.path_exists("/path/to/dir/file.csv") + >>> for root, dirs, files in connection.walk("/path/to/dir"): + ... break + >>> os.fspath(root) + '/path/to/dir' + >>> dirs + [] + >>> os.fspath(files[0]) + '/path/to/dir/file.csv' + >>> connection.path_exists("/path/to/dir/file.csv") + True """ @abstractmethod @@ -451,6 +484,8 @@ def download_file( Supports only one file download per call. 
Directory download is **NOT** supported, use :ref:`file-downloader` instead. + .. versionadded:: 0.8.0 + Parameters ---------- remote_file_path : str or :obj:`os.PathLike` @@ -483,14 +518,18 @@ def download_file( Examples -------- - .. code:: python - - local_file = connection.download_file( - remote_file_path="/path/to/source.csv", local_file_path="/path/to/target.csv" - ) - assert local_file.exists() - assert os.fspath(local_file) == "/path/to/target.csv" - assert local_file.stat().st_size == connection.get_stat("/path/to/source.csv").st_size + >>> local_file = connection.download_file( + ... remote_file_path="/path/to/source.csv", + ... local_file_path="/path/to/target.csv", + ... ) + >>> os.fspath(local_file) + '/path/to/target.csv' + >>> local_file.exists() + True + >>> local_file.stat().st_size # in bytes + 1024 + >>> connection.get_stat("/path/to/source.csv").st_size # same size + 1024 """ @abstractmethod @@ -507,6 +546,8 @@ def upload_file( Supports only one file upload per call. Directory upload is **NOT** supported, use :ref:`file-uploader` instead. + .. versionadded:: 0.8.0 + Parameters ---------- local_file_path : str or :obj:`os.PathLike` @@ -539,14 +580,18 @@ def upload_file( Examples -------- - .. code:: python - - remote_file = connection.upload( - local_file_path="/path/to/source.csv", - remote_file_path="/path/to/target.csv", - ) - assert connection.path_exists("/path/to/target.csv") - assert remote_file.stat().st_size == os.stat("/path/to/source.csv").st_size + >>> remote_file = connection.upload( + ... local_file_path="/path/to/source.csv", + ... remote_file_path="/path/to/target.csv", + ... 
) + >>> os.fspath(remote_file) + '/path/to/target.csv' + >>> connection.path_exists("/path/to/target.csv") + True + >>> remote_file.stat().st_size # in bytes + 1024 + >>> os.stat("/path/to/source.csv").st_size # same as source + 1024 """ @abstractmethod @@ -554,6 +599,8 @@ def read_text(self, path: os.PathLike | str, encoding: str = "utf-8") -> str: r""" Returns string content of a file at specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -577,10 +624,8 @@ def read_text(self, path: os.PathLike | str, encoding: str = "utf-8") -> str: Examples -------- - .. code:: python - - content = connection.read_text("/path/to/dir/file.csv") - assert content == "some;header\n1;2" + >>> connection.read_text("/path/to/dir/file.csv") + 'some;header\n1;2' """ @abstractmethod @@ -588,6 +633,8 @@ def read_bytes(self, path: os.PathLike | str) -> bytes: """ Returns binary content of a file at specific path. |support_hooks| + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -608,10 +655,8 @@ def read_bytes(self, path: os.PathLike | str) -> bytes: Examples -------- - .. code:: python - - content = connection.read_bytes("/path/to/dir/file.csv") - assert content == b"0xdeadbeef" + >>> connection.read_bytes("/path/to/dir/file.csv") + b'0xdeadbeef' """ @abstractmethod @@ -628,6 +673,8 @@ def write_text( If file already exists, its content will be replaced. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -654,10 +701,11 @@ def write_text( Examples -------- - .. 
code:: python - - file_path = connection.write_text("/path/to/dir/file.csv", "some;header\n1;2") - assert file_path.stat.st_size > 0 + >>> file_path = connection.write_text("/path/to/dir/file.csv", "some;header\n1;2") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_size # in bytes + 1024 """ @abstractmethod @@ -669,6 +717,8 @@ def write_bytes(self, path: os.PathLike | str, content: bytes) -> PathWithStatsP If file already exists, its content will be replaced. + .. versionadded:: 0.8.0 + Parameters ---------- path : str or :obj:`os.PathLike` @@ -692,10 +742,11 @@ def write_bytes(self, path: os.PathLike | str, content: bytes) -> PathWithStatsP Examples -------- - .. code:: python - - file_path = connection.write_bytes("/path/to/dir/file.csv", b"0xdeadbeef") - assert file_path.stat.st_size > 0 + >>> file_path = connection.write_bytes("/path/to/dir/file.csv", b"0xdeadbeef") + >>> os.fspath(file_path) + '/path/to/dir/file.csv' + >>> file_path.stat.st_size # in bytes + 1024 """ @property diff --git a/onetl/base/base_file_df_connection.py b/onetl/base/base_file_df_connection.py index c432bd5a2..c54390ce8 100644 --- a/onetl/base/base_file_df_connection.py +++ b/onetl/base/base_file_df_connection.py @@ -17,7 +17,9 @@ class FileDFReadOptions(ABC): """ - Protocol for objects supporting altering Spark DataFrameReader options + Protocol for objects supporting altering Spark DataFrameReader options. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -25,6 +27,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameReader`. + .. 
versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameReader` @@ -38,7 +42,9 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM class FileDFWriteOptions(ABC): """ - Protocol for objects supporting altering Spark DataFrameWriter options + Protocol for objects supporting altering Spark DataFrameWriter options. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -46,6 +52,8 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameWriter`. + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameWriter` @@ -59,7 +67,9 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM class BaseFileDFConnection(BaseConnection): """ - Implements generic methods for reading and writing dataframe as files + Implements generic methods for reading and writing dataframe as files. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -70,6 +80,8 @@ def check_if_format_supported( """ Validate if specific file format is supported. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -80,12 +92,17 @@ def check_if_format_supported( def path_from_string(self, path: os.PathLike | str) -> PurePathProtocol: """ Convert path from string to object. |support_hooks| + + .. versionadded:: 0.9.0 """ @property @abstractmethod def instance_url(self) -> str: - """Instance URL""" + """Instance URL. + + .. versionadded:: 0.9.0 + """ @abstractmethod def read_files_as_df( @@ -98,6 +115,8 @@ def read_files_as_df( ) -> DataFrame: """ Read files in some paths list as dataframe. |support_hooks| + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -110,4 +129,6 @@ def write_df_as_files( ) -> None: """ Write dataframe as files in some path. |support_hooks| + + .. 
versionadded:: 0.9.0 """ diff --git a/onetl/base/base_file_filter.py b/onetl/base/base_file_filter.py index 641a5b3ed..01a9893f1 100644 --- a/onetl/base/base_file_filter.py +++ b/onetl/base/base_file_filter.py @@ -15,6 +15,8 @@ class BaseFileFilter(ABC): to determine if a file should be handled or not. All filters are stateless. + + .. versionadded:: 0.8.0 """ @abstractmethod @@ -22,14 +24,17 @@ def match(self, path: PathProtocol) -> bool: """ Returns ``True`` if path is matching the filter, ``False`` otherwise + .. versionadded:: 0.8.0 + Examples -------- - .. code:: python - - from onetl.impl import LocalPath + from onetl.impl import LocalPath - assert filter.match(LocalPath("/path/to/file.csv")) - assert not filter.match(LocalPath("/path/to/excluded.csv")) - assert filter.match(LocalPath("/path/to/file.csv")) + >>> filter.match(LocalPath("/path/to/file.csv")) + True + >>> filter.match(LocalPath("/path/to/excluded.csv")) + False + >>> filter.match(LocalPath("/path/to/file.csv")) + True """ diff --git a/onetl/base/base_file_format.py b/onetl/base/base_file_format.py index b1f32aa07..a4c72e3e5 100644 --- a/onetl/base/base_file_format.py +++ b/onetl/base/base_file_format.py @@ -12,6 +12,8 @@ class BaseReadableFileFormat(ABC): """ Representation of readable file format. + + .. versionadded:: 0.9.0 """ @abstractmethod @@ -19,6 +21,8 @@ def check_if_supported(self, spark: SparkSession) -> None: """ Check if Spark session does support this file format. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -30,6 +34,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameReader`. |support_hooks| + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameReader` @@ -44,6 +50,8 @@ def apply_to_reader(self, reader: DataFrameReader) -> DataFrameReader | ContextM class BaseWritableFileFormat(ABC): """ Representation of writable file format. + + .. 
versionadded:: 0.9.0 """ @abstractmethod @@ -51,6 +59,8 @@ def check_if_supported(self, spark: SparkSession) -> None: """ Check if Spark session does support this file format. |support_hooks| + .. versionadded:: 0.9.0 + Raises ------ RuntimeError @@ -62,6 +72,8 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter | ContextM """ Apply provided format to :obj:`pyspark.sql.DataFrameWriter`. |support_hooks| + .. versionadded:: 0.9.0 + Returns ------- :obj:`pyspark.sql.DataFrameWriter` diff --git a/onetl/base/base_file_limit.py b/onetl/base/base_file_limit.py index f71f1910c..d930690d6 100644 --- a/onetl/base/base_file_limit.py +++ b/onetl/base/base_file_limit.py @@ -17,6 +17,8 @@ class BaseFileLimit(ABC): to determine if internal loop should be stopped. Unlike file filters, limits have internal state which can be updated or reset. + + .. versionadded:: 0.8.0 """ @abstractmethod @@ -24,6 +26,8 @@ def reset(self) -> Self: """ Resets the internal limit state. + .. versionadded:: 0.8.0 + Returns ------- Returns a filter of the same type, but with non-reached state. @@ -33,12 +37,11 @@ def reset(self) -> Self: Examples -------- - .. code:: python - - assert limit.is_reached - - new_limit = limit.reset() - assert not new_limit.is_reached + >>> limit.is_reached + True + >>> new_limit = limit.reset() + >>> new_limit.is_reached + False """ @abstractmethod @@ -46,6 +49,8 @@ def stops_at(self, path: PathProtocol) -> bool: """ Update internal state and return current state. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` @@ -58,21 +63,18 @@ def stops_at(self, path: PathProtocol) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - - assert not limit.stops_at(LocalPath("/path/to/file.csv")) - # do this multiple times - ... 
- - # stopped on some input - assert limit.stops_at(LocalPath("/path/to/another.csv")) - - # at this point, .stops_at() and .is_reached will always return True, - # even on inputs that returned False before. - # it will be in the same state until .reset() is called - assert limit.stops_at(LocalPath("/path/to/file.csv")) + >>> from onetl.impl import LocalPath + >>> # limit is not reached yet + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + False + >>> # after limit is reached + >>> limit.stops_at(LocalPath("/path/to/another.csv")) + True + >>> # at this point, .stops_at() and .is_reached will always return True, + >>> # even on inputs that returned False before. + >>> # it will be in the same state until .reset() is called + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + True """ @property @@ -81,6 +83,8 @@ def is_reached(self) -> bool: """ Check if limit is reached. + .. versionadded:: 0.8.0 + Returns ------- ``True`` if limit is reached, ``False`` otherwise. @@ -88,15 +92,16 @@ def is_reached(self) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - - assert not limit.is_reached - - assert not limit.stops_at(LocalPath("/path/to/file.csv")) - assert not limit.is_reached - - assert limit.stops_at(LocalPath("/path/to/file.csv")) - assert limit.is_reached + >>> from onetl.impl import LocalPath + >>> limit.is_reached + False + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + False + >>> limit.is_reached + False + >>> # after limit is reached + >>> limit.stops_at(LocalPath("/path/to/file.csv")) + True + >>> limit.is_reached + True """ diff --git a/onetl/base/supports_rename_dir.py b/onetl/base/supports_rename_dir.py index 703866992..8f0d39712 100644 --- a/onetl/base/supports_rename_dir.py +++ b/onetl/base/supports_rename_dir.py @@ -12,7 +12,9 @@ @runtime_checkable class SupportsRenameDir(Protocol): """ - Protocol for objects containing ``rename_dir`` method + Protocol for objects containing ``rename_dir`` method. + + .. 
versionadded:: 0.8.0 """ def rename_dir( diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 88de5645a..0097f2862 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -3,11 +3,18 @@ from __future__ import annotations import logging -import warnings from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.clickhouse.dialect import ClickhouseDialect +from onetl.connection.db_connection.clickhouse.options import ( + ClickhouseExecuteOptions, + ClickhouseFetchOptions, + ClickhouseReadOptions, + ClickhouseSQLOptions, + ClickhouseWriteOptions, +) from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_mixin import JDBCStatementType from onetl.hooks import slot, support_hooks @@ -28,13 +35,15 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch5 `_ (`official Clickhouse JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`clickhouse-prerequisites` + .. 
versionadded:: 0.1.0 + Parameters ---------- host : str @@ -104,13 +113,37 @@ class Clickhouse(JDBCConnection): Extra = ClickhouseExtra Dialect = ClickhouseDialect - DRIVER: ClassVar[str] = "ru.yandex.clickhouse.ClickHouseDriver" + ReadOptions = ClickhouseReadOptions + WriteOptions = ClickhouseWriteOptions + SQLOptions = ClickhouseSQLOptions + FetchOptions = ClickhouseFetchOptions + ExecuteOptions = ClickhouseExecuteOptions + + DRIVER: ClassVar[str] = "com.clickhouse.jdbc.ClickHouseDriver" @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages( + cls, + package_version: str | None = None, + apache_http_client_version: str | None = None, + ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC and Apache HTTP Client versions. |support_hooks| + + .. versionadded:: 0.9.0 + + Parameters + ---------- + package_version : str, optional + ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch5``. + + .. versionadded:: 0.11.0 + + apache_http_client_version : str, optional + Apache HTTP Client version package. Defaults to ``5.3.1``. + + .. 
versionadded:: 0.11.0 Examples -------- @@ -119,27 +152,42 @@ def get_packages(cls) -> list[str]: from onetl.connection import Clickhouse - Clickhouse.get_packages() + Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") """ - return ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] + default_jdbc_version = "0.6.0-patch5" + default_http_version = "5.3.1" + + jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) + http_version = Version(apache_http_client_version or default_http_version).min_digits(3) + + result = [ + f"com.clickhouse:clickhouse-jdbc:{jdbc_version}", + f"com.clickhouse:clickhouse-http-client:{jdbc_version}", + ] + + if jdbc_version >= Version("0.5.0"): + result.append(f"org.apache.httpcomponents.client5:httpclient5:{http_version}") + + return result @classproperty - def package(cls) -> str: - """Get package name to be downloaded by Spark.""" - msg = "`Clickhouse.package` will be removed in 1.0.0, use `Clickhouse.get_packages()` instead" - warnings.warn(msg, UserWarning, stacklevel=3) - return "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + def package(self) -> str: + """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" + return "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - if self.database: - return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}" + + return f"jdbc:clickhouse://{self.host}:{self.port}" - return f"jdbc:clickhouse://{self.host}:{self.port}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + 
result.update(self.extra.dict(by_alias=True)) + return result @staticmethod def _build_statement( diff --git a/onetl/connection/db_connection/clickhouse/dialect.py b/onetl/connection/db_connection/clickhouse/dialect.py index 187b2e787..2c03620d3 100644 --- a/onetl/connection/db_connection/clickhouse/dialect.py +++ b/onetl/connection/db_connection/clickhouse/dialect.py @@ -26,9 +26,11 @@ def get_min_value(self, value: Any) -> str: return f"minOrNull({result})" def _serialize_datetime(self, value: datetime) -> str: - result = value.strftime("%Y-%m-%d %H:%M:%S") - return f"CAST('{result}' AS DateTime)" + # this requires at least Clickhouse 21.1, see: + # https://github.com/ClickHouse/ClickHouse/issues/16655 + result = value.strftime("%Y-%m-%d %H:%M:%S.%f") + return f"toDateTime64('{result}', 6)" def _serialize_date(self, value: date) -> str: result = value.strftime("%Y-%m-%d") - return f"CAST('{result}' AS Date)" + return f"toDate('{result}')" diff --git a/onetl/connection/db_connection/clickhouse/options.py b/onetl/connection/db_connection/clickhouse/options.py new file mode 100644 index 000000000..5e35c9693 --- /dev/null +++ b/onetl/connection/db_connection/clickhouse/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class ClickhouseReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class ClickhouseWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class ClickhouseSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class ClickhouseFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: 
ignore[assignment] + + +class ClickhouseExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 4460be149..be62afa5f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -26,12 +26,19 @@ ) from onetl.connection.db_connection.greenplum.dialect import GreenplumDialect from onetl.connection.db_connection.greenplum.options import ( + GreenplumExecuteOptions, + GreenplumFetchOptions, GreenplumReadOptions, + GreenplumSQLOptions, GreenplumTableExistBehavior, GreenplumWriteOptions, ) from onetl.connection.db_connection.jdbc_mixin import JDBCMixin -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, + JDBCOptions, +) from onetl.exception import MISSING_JVM_CLASS_MSG, TooManyParallelJobsError from onetl.hooks import slot, support_hooks from onetl.hwm import Window @@ -70,10 +77,12 @@ class Greenplum(JDBCMixin, DBConnection): Based on package ``io.pivotal:greenplum-spark:2.2.0`` (`VMware Greenplum connector for Spark `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`greenplum-prerequisites` + .. 
versionadded:: 0.5.0 + Parameters ---------- host : str @@ -150,10 +159,14 @@ class Greenplum(JDBCMixin, DBConnection): port: int = 5432 extra: GreenplumExtra = GreenplumExtra() - Extra = GreenplumExtra - Dialect = GreenplumDialect ReadOptions = GreenplumReadOptions WriteOptions = GreenplumWriteOptions + SQLOptions = GreenplumSQLOptions + FetchOptions = GreenplumFetchOptions + ExecuteOptions = GreenplumExecuteOptions + + Extra = GreenplumExtra + Dialect = GreenplumDialect DRIVER: ClassVar[str] = "org.postgresql.Driver" CONNECTIONS_WARNING_LIMIT: ClassVar[int] = 31 @@ -164,9 +177,9 @@ class Greenplum(JDBCMixin, DBConnection): def get_packages( cls, *, - scala_version: str | Version | None = None, - spark_version: str | Version | None = None, - package_version: str | Version | None = None, + scala_version: str | None = None, + spark_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -175,6 +188,8 @@ def get_packages( You should pass either ``scala_version`` or ``spark_version``. + .. versionadded:: 0.9.0 + Parameters ---------- scala_version : str, optional @@ -190,6 +205,8 @@ def get_packages( package_version : str, optional, default ``2.2.0`` Package version in format ``major.minor.patch`` + .. 
versionadded:: 0.10.1 + Examples -------- @@ -204,24 +221,24 @@ def get_packages( # Connector version is fixed, so we can perform checks for Scala/Spark version if package_version: - package_ver = Version.parse(package_version) + package_ver = Version(package_version) else: - package_ver = Version(2, 2, 0) + package_ver = Version("2.2.0") if scala_version: - scala_ver = Version.parse(scala_version) + scala_ver = Version(scala_version).min_digits(2) elif spark_version: - spark_ver = Version.parse(spark_version) - if spark_ver.digits(2) > (3, 2) or spark_ver.digits(2) < (2, 3): + spark_ver = Version(spark_version).min_digits(2) + if spark_ver >= Version("3.3") or spark_ver < Version("2.3"): raise ValueError(f"Spark version must be 2.3.x - 3.2.x, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: raise ValueError("You should pass either `scala_version` or `spark_version`") - if scala_ver.digits(2) < (2, 11) or scala_ver.digits(2) > (2, 12): - raise ValueError(f"Scala version must be 2.11 - 2.12, got {scala_ver}") + if scala_ver < Version("2.11") or scala_ver > Version("2.12"): + raise ValueError(f"Scala version must be 2.11 - 2.12, got {scala_ver.format('{0}.{1}')}") - return [f"io.pivotal:greenplum-spark_{scala_ver.digits(2)}:{package_ver.digits(3)}"] + return [f"io.pivotal:greenplum-spark_{scala_ver.format('{0}.{1}')}:{package_ver}"] @classproperty def package_spark_2_3(cls) -> str: @@ -250,15 +267,20 @@ def instance_url(self) -> str: @property def jdbc_url(self) -> str: - extra = { - key: value - for key, value in self.extra.dict(by_alias=True).items() - if not (key.startswith("server.") or key.startswith("pool.")) - } - extra["ApplicationName"] = extra.get("ApplicationName", self.spark.sparkContext.appName) + return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + 
@property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update( + { + key: value + for key, value in self.extra.dict(by_alias=True).items() + if not (key.startswith("server.") or key.startswith("pool.")) + }, + ) + result["ApplicationName"] = result.get("ApplicationName", self.spark.sparkContext.appName) + return result @slot def read_source_as_df( @@ -387,7 +409,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, @@ -415,7 +437,7 @@ def _connector_params( **extra, } - def _options_to_connection_properties(self, options: JDBCOptions): + def _options_to_connection_properties(self, options: JDBCOptions | JDBCExecuteOptions | JDBCFetchOptions): # See https://github.com/pgjdbc/pgjdbc/pull/1252 # Since 42.2.9 Postgres JDBC Driver added new option readOnlyMode=transaction # Which is not a desired behavior, because `.fetch()` method should always be read-only @@ -434,7 +456,7 @@ def _get_server_setting(self, name: str) -> Any: log.debug("|%s| Executing SQL query (on driver):") log_lines(log, query, level=logging.DEBUG) - df = self._query_on_driver(query, self.JDBCOptions()) + df = self._query_on_driver(query, self.FetchOptions()) result = df.collect() log.debug( @@ -455,7 +477,7 @@ def _get_occupied_connections_count(self) -> int: log.debug("|%s| Executing SQL query (on driver):") log_lines(log, query, level=logging.DEBUG) - df = self._query_on_driver(query, self.JDBCOptions()) + df = self._query_on_driver(query, self.FetchOptions()) result = df.collect() log.debug( diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index e100e35f1..e1cd1902a 100644 --- a/onetl/connection/db_connection/greenplum/options.py 
+++ b/onetl/connection/db_connection/greenplum/options.py @@ -11,7 +11,12 @@ except (ImportError, AttributeError): from pydantic import Field, root_validator # type: ignore[no-redef, assignment] +from onetl.connection.db_connection.jdbc_connection.options import JDBCSQLOptions from onetl.connection.db_connection.jdbc_mixin import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) # options from which are populated by Greenplum class methods GENERIC_PROHIBITED_OPTIONS = frozenset( @@ -299,6 +304,8 @@ class Config: * Table exists An error is raised, and no data is written to the table. + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ @root_validator(pre=True) @@ -311,3 +318,15 @@ def _mode_is_deprecated(cls, values): stacklevel=3, ) return values + + +class GreenplumSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class GreenplumFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class GreenplumExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index f465fd73a..fbedebefa 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -39,60 +39,26 @@ class Hive(DBConnection): """Spark connection with Hive MetaStore support. |support_hooks| - You don't need a Hive server to use this connector. + .. seealso:: - .. dropdown:: Version compatibility + Before using this connector please take into account :ref:`hive-prerequisites` - * Hive metastore version: 0.12 - 3.1.2 (may require to add proper .jar file explicitly) - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - .. 
warning:: - - To use Hive connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. - - .. warning:: - - This connector requires some additional configuration files to be present (``hive-site.xml`` and so on), - as well as .jar files with Hive MetaStore client. - - See `Spark Hive Tables documentation `_ - and `this guide `_ for more details. - - .. note:: - - Most of Hadoop instances use Kerberos authentication. In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. - - In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options - to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. - See `Spark security documentation `_ - for more details. + .. versionadded:: 0.1.0 Parameters ---------- cluster : str Cluster name. Used for HWM and lineage. + .. versionadded:: 0.7.0 + spark : :obj:`pyspark.sql.SparkSession` Spark session with Hive metastore support enabled Examples -------- - Hive connection initialization + Hive connection initialization: .. code:: python @@ -105,7 +71,11 @@ class Hive(DBConnection): # Create connection hive = Hive(cluster="rnd-dwh", spark=spark).check() - Hive connection initialization with Kerberos support + Hive connection initialization with Kerberos support: + + .. code:: bash + + $ kinit -kt /path/to/keytab user .. code:: python @@ -151,6 +121,8 @@ def get_current(cls, spark: SparkSession): Can be used only if there are some hooks bound to :obj:`Slots.get_current_cluster ` slot. + .. 
versionadded:: 0.7.0 + Parameters ---------- spark : :obj:`pyspark.sql.SparkSession` @@ -217,32 +189,19 @@ def sql( Same as ``spark.sql(query)``. + .. versionadded:: 0.2.0 + Parameters ---------- query : str - SQL query to be executed, like: - - * ``SELECT ... FROM ...`` - * ``WITH ... AS (...) SELECT ... FROM ...`` - * ``SHOW ...`` queries are also supported, like ``SHOW TABLES`` + SQL query to be executed. Returns ------- df : pyspark.sql.dataframe.DataFrame Spark dataframe - - Examples - -------- - - Read data from Hive table: - - .. code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - df = connection.sql("SELECT * FROM mytable") """ query = clear_statement(query) @@ -262,47 +221,13 @@ def execute( """ Execute DDL or DML statement. |support_hooks| + .. versionadded:: 0.2.0 + Parameters ---------- statement : str - Statement to be executed, like: - - DML statements: - - * ``INSERT INTO target_table SELECT * FROM source_table`` - * ``TRUNCATE TABLE mytable`` - - DDL statements: - - * ``CREATE TABLE mytable (...)`` - * ``ALTER TABLE mytable ...`` - * ``DROP TABLE mytable`` - * ``MSCK REPAIR TABLE mytable`` - - The exact list of supported statements depends on Hive version, - for example some new versions support ``CREATE FUNCTION`` syntax. - - Examples - -------- - - Create table: - - .. code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - connection.execute( - "CREATE TABLE mytable (id NUMBER, data VARCHAR) PARTITIONED BY (date DATE)" - ) - - Drop table partition: - - .. code:: python - - connection = Hive(cluster="rnd-dwh", spark=spark) - - connection.execute("ALTER TABLE mytable DROP PARTITION(date='2023-02-01')") + Statement to be executed. 
""" statement = clear_statement(statement) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 5f687112f..a196487a1 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -217,24 +217,8 @@ class Config: """ List of columns should be used for data partitioning. ``None`` means partitioning is disabled. - Each partition is a folder which contains only files with the specific column value, - like ``myschema.db/mytable/col1=value1``, ``myschema.db/mytable/col1=value2``, and so on. - - Multiple partitions columns means nested folder structure, like ``myschema.db/mytable/col1=val1/col2=val2``. - - If ``WHERE`` clause in the query contains expression like ``partition = value``, - Spark will scan only files in a specific partition. - Examples: ``reg_id`` or ``["reg_id", "business_dt"]`` - .. note:: - - Values should be scalars (integers, strings), - and either static (``countryId``) or incrementing (dates, years), with low - number of distinct values. - - Columns like ``userId`` or ``datetime``/``timestamp`` should **NOT** be used for partitioning. - .. warning:: Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` diff --git a/onetl/connection/db_connection/hive/slots.py b/onetl/connection/db_connection/hive/slots.py index ce4ceea97..3044950fd 100644 --- a/onetl/connection/db_connection/hive/slots.py +++ b/onetl/connection/db_connection/hive/slots.py @@ -7,7 +7,10 @@ @support_hooks class HiveSlots: - """:ref:`Slots ` that could be implemented by third-party plugins.""" + """:ref:`Slots ` that could be implemented by third-party plugins. + + .. versionadded:: 0.7.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. 
versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -53,6 +58,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into Hive constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- set[str] | None @@ -84,6 +91,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~check` method to verify that connection is created only from the same cluster. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- str | None diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 3133d3671..586b63924 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -4,6 +4,7 @@ import logging import secrets +import warnings from typing import TYPE_CHECKING, Any from etl_entities.instance import Host @@ -15,6 +16,7 @@ JDBCLegacyOptions, JDBCPartitioningMode, JDBCReadOptions, + JDBCSQLOptions, JDBCTableExistBehavior, JDBCWriteOptions, ) @@ -50,6 +52,7 @@ class JDBCConnection(JDBCMixin, DBConnection): Dialect = JDBCDialect ReadOptions = JDBCReadOptions + SQLOptions = JDBCSQLOptions WriteOptions = JDBCWriteOptions Options = JDBCLegacyOptions @@ -61,20 +64,22 @@ def instance_url(self) -> str: def sql( self, query: str, - options: JDBCReadOptions | dict | None = None, + options: JDBCSQLOptions | dict | None = None, ) -> DataFrame: """ **Lazily** execute SELECT statement **on Spark executor** and return DataFrame. |support_hooks| Same as ``spark.read.jdbc(query)``. + .. versionadded:: 0.2.0 + Parameters ---------- query : str SQL query to be executed. - options : dict, :obj:`~ReadOptions`, default: ``None`` + options : dict, :obj:`~SQLOptions`, default: ``None`` Spark options to be used while fetching data. 
@@ -86,12 +91,17 @@ def sql( """ + if isinstance(options, JDBCReadOptions): + msg = "Using `ReadOptions` for `sql` method is deprecated, use `SQLOptions` instead." + warnings.warn(msg, UserWarning, stacklevel=3) + options = self.SQLOptions.parse_obj(options.dict(exclude={"partitioning_mode"}, exclude_none=True)) + query = clear_statement(query) log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_executor(query, self.ReadOptions.parse(options)) + df = self._query_on_executor(query, self.SQLOptions.parse(options)) log.info("|Spark| DataFrame successfully created from SQL statement ") return df @@ -151,7 +161,12 @@ def read_source_as_df( limit=limit, ) - result = self.sql(query, read_options) + log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) + log_lines(log, query) + + result = self._query_on_executor(query, self.ReadOptions.parse(read_options)) + + log.info("|Spark| DataFrame successfully created from SQL statement ") if alias: result = result.drop(alias) @@ -165,7 +180,7 @@ def write_df_to_target( options: JDBCWriteOptions | None = None, ) -> None: write_options = self.WriteOptions.parse(options) - jdbc_params = self.options_to_jdbc_params(write_options) + jdbc_properties = self._get_jdbc_properties(write_options, exclude={"if_exists"}, exclude_none=True) mode = ( "overwrite" @@ -173,7 +188,7 @@ def write_df_to_target( else write_options.if_exists.value ) log.info("|%s| Saving data to a table %r", self.__class__.__name__, target) - df.write.jdbc(table=target, mode=mode, **jdbc_params) + df.write.format("jdbc").mode(mode).options(dbtable=target, **jdbc_properties).save() log.info("|%s| Table %r successfully written", self.__class__.__name__, target) @slot @@ -196,38 +211,6 @@ def get_df_schema( return df.schema - def options_to_jdbc_params( - self, - options: JDBCReadOptions | JDBCWriteOptions, - ) -> dict: - # Have to replace the parameter with - # since the method 
takes the named parameter - # link to source below - # https://github.com/apache/spark/blob/2ef8ced27a6b0170a691722a855d3886e079f037/python/pyspark/sql/readwriter.py#L465 - - partition_column = getattr(options, "partition_column", None) - if partition_column: - options = options.copy( - update={"column": partition_column}, - exclude={"partition_column"}, - ) - - result = self._get_jdbc_properties( - options, - include=READ_TOP_LEVEL_OPTIONS | WRITE_TOP_LEVEL_OPTIONS, - exclude={"if_exists"}, - exclude_none=True, - ) - - result["properties"] = self._get_jdbc_properties( - options, - exclude=READ_TOP_LEVEL_OPTIONS | WRITE_TOP_LEVEL_OPTIONS | {"if_exists"}, - exclude_none=True, - ) - - result["properties"].pop("partitioningMode", None) - return result - @slot def get_min_max_values( self, @@ -273,10 +256,10 @@ def get_min_max_values( def _query_on_executor( self, query: str, - options: JDBCReadOptions, + options: JDBCSQLOptions | JDBCReadOptions, ) -> DataFrame: - jdbc_params = self.options_to_jdbc_params(options) - return self.spark.read.jdbc(table=f"({query}) T", **jdbc_params) + jdbc_properties = self._get_jdbc_properties(options, exclude_none=True) + return self.spark.read.format("jdbc").options(dbtable=f"({query}) T", **jdbc_properties).load() def _exclude_partition_options( self, diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 50eda6a51..12ac24193 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -115,6 +115,9 @@ class JDBCReadOptions(JDBCOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.5.0 + Replace ``Connection.Options`` → ``Connection.ReadOptions`` + Examples -------- @@ -196,6 +199,9 @@ class Config: default ``fetchsize=10``, which is absolutely not usable. 
Thus we've overridden default value with ``100_000``, which should increase reading performance. + + .. versionchanged:: 0.2.0 + Set explicit default value to ``100_000`` """ partitioning_mode: JDBCPartitioningMode = JDBCPartitioningMode.RANGE @@ -306,6 +312,8 @@ class Config: SELECT ... FROM table WHERE (partition_column mod num_partitions) = num_partitions-1 -- upper_bound + .. versionadded:: 0.5.0 + Examples -------- @@ -383,6 +391,9 @@ class JDBCWriteOptions(JDBCOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.5.0 + Replace ``Connection.Options`` → ``Connection.WriteOptions`` + Examples -------- @@ -466,6 +477,8 @@ class Config: * Table exists An error is raised, and no data is written to the table. + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ batchsize: int = 20_000 @@ -486,6 +499,9 @@ class Config: You can increase it even more, up to ``50_000``, but it depends on your database load and number of columns in the row. Higher values does not increase performance. + + .. versionchanged:: 0.4.0 + Changed default value from 1000 to 20_000 """ isolation_level: str = "READ_UNCOMMITTED" @@ -515,6 +531,114 @@ def _mode_is_deprecated(cls, values): return values +class JDBCSQLOptions(JDBCOptions): + """Options specifically for SQL queries + + These options allow you to specify configurations for executing SQL queries + without relying on Spark's partitioning mechanisms. + + .. note:: + + You can pass any JDBC configuration + `supported by Spark `_, + tailored to optimize SQL query execution. Option names should be in ``camelCase``! + + .. versionadded:: 0.11.0 + Split up ``ReadOptions`` to ``SQLOptions`` + """ + + partition_column: Optional[str] = None + """Column used to partition data across multiple executors for parallel query processing. + + .. warning:: + It is highly recommended to use primary key, or at least a column with an index + to avoid performance issues. 
+ + Example of using partition_column for range-based partitioning: + + .. code-block:: sql + + -- If partition_column is 'id', with num_partitions=4, lower_bound=1, and upper_bound=100: + -- Executor 1 processes IDs from 1 to 25 + SELECT ... FROM table WHERE id >= 1 AND id < 26 + -- Executor 2 processes IDs from 26 to 50 + SELECT ... FROM table WHERE id >= 26 AND id < 51 + -- Executor 3 processes IDs from 51 to 75 + SELECT ... FROM table WHERE id >= 51 AND id < 76 + -- Executor 4 processes IDs from 76 to 100 + SELECT ... FROM table WHERE id >= 76 AND id <= 100 + + + -- General case for Executor N + SELECT ... FROM table + WHERE partition_column >= (lower_bound + (N-1) * stride) + AND partition_column <= upper_bound + -- Where ``stride`` is calculated as ``(upper_bound - lower_bound) / num_partitions``. + """ + + num_partitions: Optional[int] = None + """Number of jobs created by Spark to read the table content in parallel.""" # noqa: WPS322 + + lower_bound: Optional[int] = None + """Defines the starting boundary for partitioning the query's data. Mandatory if :obj:`~partition_column` is set""" # noqa: WPS322 + + upper_bound: Optional[int] = None + """Sets the ending boundary for data partitioning. Mandatory if :obj:`~partition_column` is set""" # noqa: WPS322 + + session_init_statement: Optional[str] = None + '''After each database session is opened to the remote DB and before starting to read data, + this option executes a custom SQL statement (or a PL/SQL block). + + Use this to implement session initialization code. + + Example: + + .. code:: python + + sessionInitStatement = """ + BEGIN + execute immediate + 'alter session set "_serial_direct_read"=true'; + END; + """ + ''' + + fetchsize: int = 100_000 + """Fetch N rows from an opened cursor per one read round. + + Tuning this option can influence performance of reading. + + .. warning:: + + Default value is different from Spark. 
+ + Spark uses driver's own value, and it may be different in different drivers, + and even versions of the same driver. For example, Oracle has + default ``fetchsize=10``, which is absolutely not usable. + + Thus we've overridden default value with ``100_000``, which should increase reading performance. + + .. versionchanged:: 0.2.0 + Set explicit default value to ``100_000`` + """ + + class Config: + known_options = READ_OPTIONS - {"partitioning_mode"} + prohibited_options = JDBCOptions.Config.prohibited_options | {"partitioning_mode"} + alias_generator = to_camel + + @root_validator(pre=True) + def _check_partition_fields(cls, values): + num_partitions = values.get("num_partitions") + lower_bound = values.get("lower_bound") + upper_bound = values.get("upper_bound") + + if num_partitions is not None and num_partitions > 1: + if lower_bound is None or upper_bound is None: + raise ValueError("lower_bound and upper_bound must be set if num_partitions > 1") + return values + + @deprecated( "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'ReadOptions' or 'WriteOptions' instead", category=UserWarning, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 76424c30c..f42450b6e 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -17,6 +17,11 @@ from onetl._internal import clear_statement, stringify from onetl._util.java import get_java_gateway, try_import_java_class from onetl._util.spark import get_spark_version +from onetl._util.version import Version +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCOptions as JDBCMixinOptions, ) @@ -63,6 +68,8 @@ class JDBCMixin(FrozenModel): password: SecretStr JDBCOptions = JDBCMixinOptions + FetchOptions = JDBCFetchOptions + ExecuteOptions = JDBCExecuteOptions DRIVER: ClassVar[str] _CHECK_QUERY: ClassVar[str] = "SELECT 1" @@ -75,6 +82,16 @@ class JDBCMixin(FrozenModel): def jdbc_url(self) -> str: """JDBC Connection URL""" + @property + def jdbc_params(self) -> dict: + """JDBC Connection params""" + return { + "user": self.user, + "password": self.password.get_secret_value() if self.password is not None else "", + "driver": self.DRIVER, + "url": self.jdbc_url, + } + @slot def close(self): """ @@ -96,7 +113,6 @@ def close(self): .. 
code:: python df = connection.fetch("SELECT * FROM mytable LIMIT 10") - assert df.count() connection.close() # or @@ -130,7 +146,7 @@ def check(self): log_lines(log, self._CHECK_QUERY, level=logging.DEBUG) try: - self._query_optional_on_driver(self._CHECK_QUERY, self.JDBCOptions(fetchsize=1)) # type: ignore + self._query_optional_on_driver(self._CHECK_QUERY, self.FetchOptions(fetchsize=1)) log.info("|%s| Connection is available.", self.__class__.__name__) except Exception as e: log.exception("|%s| Connection is unavailable", self.__class__.__name__) @@ -142,7 +158,7 @@ def check(self): def fetch( self, query: str, - options: JDBCMixinOptions | dict | None = None, + options: JDBCFetchOptions | dict | None = None, ) -> DataFrame: """ **Immediately** execute SELECT statement **on Spark driver** and return in-memory DataFrame. |support_hooks| @@ -158,13 +174,15 @@ def fetch( First call of the method opens the connection to a database. Call ``.close()`` method to close it, or use context manager to do it automatically. + .. versionadded:: 0.2.0 + Parameters ---------- query : str SQL query to be executed. 
- options : dict, :obj:`~JDBCOptions`, default: ``None`` + options : dict, :obj:`~FetchOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``fetchsize`` or ``queryTimeout`` @@ -184,7 +202,14 @@ def fetch( log.info("|%s| Executing SQL query (on driver):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_driver(query, self.JDBCOptions.parse(options)) + df = self._query_on_driver( + query, + ( + self.FetchOptions.parse(options.dict()) # type: ignore + if isinstance(options, JDBCMixinOptions) + else self.FetchOptions.parse(options) + ), + ) log.info( "|%s| Query succeeded, resulting in-memory dataframe contains %d rows", @@ -197,7 +222,7 @@ def fetch( def execute( self, statement: str, - options: JDBCMixinOptions | dict | None = None, + options: JDBCExecuteOptions | JDBCMixinOptions | dict | None = None, ) -> DataFrame | None: """ **Immediately** execute DDL, DML or procedure/function **on Spark driver**. |support_hooks| @@ -210,13 +235,15 @@ def execute( First call of the method opens the connection to a database. Call ``.close()`` method to close it, or use context manager to do it automatically. + .. versionadded:: 0.2.0 + Parameters ---------- statement : str Statement to be executed. 
- options : dict, :obj:`~JDBCOptions`, default: ``None`` + options : dict, :obj:`~ExecuteOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``queryTimeout`` @@ -239,7 +266,11 @@ def execute( log.info("|%s| Executing statement (on driver):", self.__class__.__name__) log_lines(log, statement) - call_options = self.JDBCOptions.parse(options) + call_options = ( + self.ExecuteOptions.parse(options.dict()) + if isinstance(options, JDBCMixinOptions) + else self.ExecuteOptions.parse(options) + ) df = self._call_on_driver(statement, call_options) if df is not None: @@ -271,7 +302,7 @@ def _check_java_class_imported(cls, spark): def _query_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions | JDBCExecuteOptions, ) -> DataFrame: return self._execute_on_driver( statement=query, @@ -284,7 +315,7 @@ def _query_on_driver( def _query_optional_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions, ) -> DataFrame | None: return self._execute_on_driver( statement=query, @@ -297,7 +328,7 @@ def _query_optional_on_driver( def _call_on_driver( self, query: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCExecuteOptions, ) -> DataFrame | None: return self._execute_on_driver( statement=query, @@ -309,25 +340,17 @@ def _call_on_driver( def _get_jdbc_properties( self, - options: JDBCMixinOptions, + options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions, **kwargs, - ) -> dict: + ) -> dict[str, str]: """ Fills up human-readable Options class to a format required by Spark internal methods """ - - result = options.copy( - update={ - "user": self.user, - "password": self.password.get_secret_value() if self.password is not None else "", - "driver": self.DRIVER, - "url": self.jdbc_url, - }, - ).dict(by_alias=True, **kwargs) - + result = self.jdbc_params + result.update(options.dict(by_alias=True, **kwargs)) return stringify(result) - 
def _options_to_connection_properties(self, options: JDBCMixinOptions): + def _options_to_connection_properties(self, options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions): """ Converts human-readable Options class to ``java.util.Properties``. @@ -338,8 +361,7 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): * https://github.com/apache/spark/blob/v2.3.0/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L248-L255 """ - jdbc_properties = self._get_jdbc_properties(options, exclude_unset=True) - + jdbc_properties = self._get_jdbc_properties(options, exclude_none=True) jdbc_utils_package = self.spark._jvm.org.apache.spark.sql.execution.datasources.jdbc # type: ignore jdbc_options = jdbc_utils_package.JDBCOptions( self.jdbc_url, @@ -349,7 +371,7 @@ def _options_to_connection_properties(self, options: JDBCMixinOptions): ) return jdbc_options.asConnectionProperties() - def _get_jdbc_connection(self, options: JDBCMixinOptions): + def _get_jdbc_connection(self, options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions): if not self._last_connection_and_options: # connection class can be used in multiple threads. 
# each Python thread creates its own thread in JVM @@ -391,7 +413,7 @@ def _execute_on_driver( statement: str, statement_type: JDBCStatementType, callback: Callable[..., T], - options: JDBCMixinOptions, + options: JDBCFetchOptions | JDBCExecuteOptions | JDBCMixinOptions, read_only: bool, ) -> T: """ @@ -413,7 +435,7 @@ def _execute_statement( self, jdbc_statement, statement: str, - options: JDBCMixinOptions, + options: JDBCMixinOptions | JDBCFetchOptions | JDBCExecuteOptions, callback: Callable[..., T], read_only: bool, ) -> T: @@ -521,7 +543,7 @@ def _resultset_to_dataframe(self, result_set) -> DataFrame: java_converters = self.spark._jvm.scala.collection.JavaConverters # type: ignore - if get_spark_version(self.spark) >= (3, 4): + if get_spark_version(self.spark) >= Version("3.4"): # https://github.com/apache/spark/commit/2349175e1b81b0a61e1ed90c2d051c01cf78de9b result_schema = jdbc_utils.getSchema(result_set, jdbc_dialect, False, False) # noqa: WPS425 else: diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 349630719..2504c3641 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -4,6 +4,8 @@ from typing import Optional +from typing_extensions import deprecated + try: from pydantic.v1 import Field except (ImportError, AttributeError): @@ -22,6 +24,7 @@ ) +@deprecated("Deprecated in 0.11.0 and will be removed in 1.0.0. Use FetchOptions or ExecuteOptions instead") class JDBCOptions(GenericOptions): """Generic options, related to specific JDBC driver. @@ -30,6 +33,46 @@ class JDBCOptions(GenericOptions): You can pass any value supported by underlying JDBC driver class, even if it is not mentioned in this documentation. + + .. 
deprecated:: 0.11.0 + Use ``FetchOptions`` or ``ExecuteOptions`` instead + """ + + class Config: + prohibited_options = PROHIBITED_OPTIONS + extra = "allow" + + query_timeout: Optional[int] = Field(default=None, alias="queryTimeout") + """The number of seconds the driver will wait for a statement to execute. + Zero means there is no limit. + + This option depends on driver implementation, + some drivers can check the timeout of each query instead of an entire JDBC batch. + """ + + fetchsize: Optional[int] = None + """How many rows to fetch per round trip. + + Tuning this option can influence performance of reading. + + .. warning:: + + Default value depends on driver. For example, Oracle has + default ``fetchsize=10``. + """ + + +class JDBCFetchOptions(GenericOptions): + """Options related to fetching data from databases via JDBC. + + .. note :: + + You can pass any value + supported by underlying JDBC driver class, + even if it is not mentioned in this documentation. + + .. versionadded:: 0.11.0 + Replace ``Connection.JDBCOptions`` → ``Connection.FetchOptions`` """ class Config: @@ -50,7 +93,42 @@ class Config: Tuning this option can influence performance of reading. .. warning:: + Default value depends on driver. For example, Oracle has + default ``fetchsize=10``. + """ + + +class JDBCExecuteOptions(GenericOptions): + """Options related to executing statements in databases via JDBC. + + .. note :: + You can pass any value + supported by underlying JDBC driver class, + even if it is not mentioned in this documentation. + + .. versionadded:: 0.11.0 + Replace ``Connection.JDBCOptions`` → ``Connection.ExecuteOptions`` + """ + + class Config: + prohibited_options = PROHIBITED_OPTIONS + extra = "allow" + + query_timeout: Optional[int] = Field(default=None, alias="queryTimeout") + """The number of seconds the driver will wait for a statement to execute. + Zero means there is no limit. 
+ + This option depends on driver implementation, + some drivers can check the timeout of each query instead of an entire JDBC batch. + """ + + fetchsize: Optional[int] = None + """How many rows to fetch per round trip. + + Tuning this option can influence performance of reading. + + .. warning:: Default value depends on driver. For example, Oracle has default ``fetchsize=10``. """ diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 21a042451..b64fff143 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -57,15 +57,15 @@ class Kafka(DBConnection): Based on `official Kafka Source For Spark `_. - .. note:: + .. seealso:: + + Before using this connector please take into account :ref:`kafka-prerequisites` - This connector is for batch download from kafka and not streaming. + .. note:: - .. dropdown:: Version compatibility + This connector is for **batch** ETL processes, not streaming. - * Apache Kafka versions: 0.10 or higher - * Spark versions: 2.4.x - 3.5.x - * Scala versions: 2.11 - 2.13 + .. versionadded:: 0.9.0 Parameters ---------- @@ -392,8 +392,8 @@ def get_df_schema( @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -424,17 +424,17 @@ def get_packages( """ # Connector version is same as Spark, do not perform any additional checks - spark_ver = Version.parse(spark_version) - if spark_ver < (2, 4): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("2.4"): # Kafka connector for Spark 2.3 is build with Kafka client 0.10.0.1 which does not support # passing `sasl.jaas.config` option. 
It is supported only in 0.10.2.0, # see https://issues.apache.org/jira/browse/KAFKA-4259 # Old client requires generating JAAS file and placing it to filesystem, which is not secure. raise ValueError(f"Spark version must be at least 2.4, got {spark_ver}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) return [ - f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.digits(2)}:{spark_ver.digits(3)}", + f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}", ] def __enter__(self): @@ -592,7 +592,7 @@ def _validate_addresses(cls, value, values): @validator("spark") def _check_spark_version(cls, spark): spark_version = get_spark_version(spark) - if spark_version < (2, 4): + if spark_version < Version("2.4"): raise ValueError(f"Spark version must be at least 2.4, got {spark_version}") return spark @@ -604,7 +604,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/connection/db_connection/kafka/kafka_auth.py b/onetl/connection/db_connection/kafka/kafka_auth.py index f39d39a3e..f1bbdf101 100644 --- a/onetl/connection/db_connection/kafka/kafka_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_auth.py @@ -12,6 +12,8 @@ class KafkaAuth(ABC): """ Interface for Kafka connection Auth classes. + + .. 
versionadded:: 0.9.0 """ @abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index e364b4123..4038dd02a 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -22,6 +22,8 @@ class KafkaBasicAuth(KafkaAuth, GenericOptions): For more details see `Kafka Documentation `_. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 6f56ca55d..6a20a31af 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -59,6 +59,8 @@ class KafkaKerberosAuth(KafkaAuth, GenericOptions): * `Kafka Documentation `_ * `Krb5LoginModule documentation `_ + .. versionadded:: 0.9.0 + Examples -------- @@ -111,7 +113,7 @@ class KafkaKerberosAuth(KafkaAuth, GenericOptions): # options without sasl.kerberos. prefix are passed to JAAS config # names are in camel case! "isInitiator": True, - # options with sasl.kerberos. prefix are passed to Kafka client config + # options with `sasl.kerberos.` prefix are passed to Kafka client config as-is "sasl.kerberos.kinit.cmd": "/usr/bin/kinit", } ) diff --git a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py index fd1384461..2dd3a6a97 100644 --- a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py @@ -21,6 +21,8 @@ class KafkaPlaintextProtocol(KafkaProtocol, FrozenModel): Not recommended to use on production environments. Prefer :obj:`SSLProtocol `. + .. 
versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/kafka_protocol.py b/onetl/connection/db_connection/kafka/kafka_protocol.py index 12de89cf8..5d2a328c8 100644 --- a/onetl/connection/db_connection/kafka/kafka_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_protocol.py @@ -12,6 +12,8 @@ class KafkaProtocol(ABC): """ Interface for Kafka connection Protocol classes. + + .. versionadded:: 0.9.0 """ @abstractmethod diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 2015123ee..add09f349 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -21,10 +21,12 @@ class KafkaScramAuth(KafkaAuth, GenericOptions): """ - Connect to Kafka using ``sasl.mechanism="SCRAM-SHA-*"``. + Connect to Kafka using ``sasl.mechanism="SCRAM-SHA-256"`` or ``sasl.mechanism="SCRAM-SHA-512"``. For more details see `Kafka Documentation `_. + .. versionadded:: 0.9.0 + Examples -------- @@ -51,7 +53,7 @@ class KafkaScramAuth(KafkaAuth, GenericOptions): "user": "me", "password": "abc", "digest": "SHA-512", - # options with sasl.login. prefix are passed to Kafka client config + # options with `sasl.login.` prefix are passed to Kafka client config as-is "sasl.login.class": "com.example.CustomScramLogin", } ) diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index f7b9ecc15..6149f5aa0 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -30,6 +30,8 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): * `IBM Documentation `_ * `How to use PEM Certificates with Kafka `_ + .. 
versionadded:: 0.9.0 + Examples -------- @@ -66,7 +68,7 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): protocol = Kafka.SSLProtocol.parse( { - # Just the same options as above, but using Kafka config naming + # Just the same options as above, but using Kafka config naming with dots "ssl.keystore.type": "PEM", "ssl.keystore.certificate_chain": "-----BEGIN CERTIFICATE-----\\nMIIDZjC...\\n-----END CERTIFICATE-----", "ssl.keystore.key": "-----BEGIN PRIVATE KEY-----\\nMIIEvg..\\n-----END PRIVATE KEY-----", @@ -79,7 +81,9 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): .. dropdown :: Not recommended - Pass PEM certificates as files: + These options are error-prone and have several drawbacks, so it is not recommended to use them. + + Passing PEM certificates as files: * ENCRYPT ``user.key`` file with password ``"some password"`` `using PKCS#8 scheme `_. * Save encrypted key to file ``/path/to/user/encrypted_key_with_certificate_chain.pem``. @@ -97,7 +101,7 @@ class KafkaSSLProtocol(KafkaProtocol, GenericOptions): truststore_location="/path/to/server.crt", ) - Pass JKS (Java Key Store) location: + Passing JKS (Java Key Store) location: * `Add user key and certificate to JKS keystore `_. * `Add server certificate to JKS truststore `_. diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 0b04f2cb5..e2a4a8d32 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -76,6 +76,8 @@ class KafkaReadOptions(GenericOptions): are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. + .. versionadded:: 0.9.0 + Examples -------- @@ -121,6 +123,8 @@ class KafkaWriteOptions(GenericOptions): are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. + .. 
versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/connection/db_connection/kafka/slots.py b/onetl/connection/db_connection/kafka/slots.py index cd6bfcbe8..2abf00ce0 100644 --- a/onetl/connection/db_connection/kafka/slots.py +++ b/onetl/connection/db_connection/kafka/slots.py @@ -7,7 +7,11 @@ @support_hooks class KafkaSlots: - """Kafka slots that could be implemented by third-party plugins""" + """ + Kafka slots that could be implemented by third-party plugins + + .. versionadded:: 0.9.0 + """ @slot @staticmethod @@ -17,6 +21,8 @@ def normalize_cluster_name(cluster: str) -> str | None: This can be used to ensure that the Kafka cluster name conforms to specific naming conventions. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str @@ -50,6 +56,8 @@ def get_known_clusters() -> set[str] | None: This can be used to validate if the provided Kafka cluster name is recognized in the system. + .. versionadded:: 0.9.0 + Returns ------- set[str] | None @@ -78,6 +86,8 @@ def normalize_address(address: str, cluster: str) -> str | None: This can be used to format the broker address according to specific rules, such as adding default ports. + .. versionadded:: 0.9.0 + Parameters ---------- address : str @@ -115,6 +125,8 @@ def get_cluster_addresses(cluster: str) -> list[str] | None: This can be used to obtain the broker addresses dynamically. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index acaaa6539..568cd9537 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -50,13 +50,15 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. 
|support_hooks| - Based on package ``org.mongodb.spark:mongo-spark-connector:10.1.1`` - (`MongoDB connector for Spark `_) + Based on package `org.mongodb.spark:mongo-spark-connector:10.3.0 `_ + (`MongoDB connector for Spark `_) - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`mongodb-prerequisites` + .. versionadded:: 0.7.0 + Parameters ---------- host : str @@ -131,15 +133,14 @@ class MongoDB(DBConnection): @classmethod def get_packages( cls, - scala_version: str | Version | None = None, - spark_version: str | Version | None = None, + scala_version: str | None = None, + spark_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| - - .. warning:: + Get package names to be downloaded by Spark. Allows specifying custom MongoDB Spark connector versions. |support_hooks| - You should pass at least one parameter. + .. versionadded:: 0.9.0 Parameters ---------- @@ -149,38 +150,43 @@ def get_packages( If ``None``, ``spark_version`` is used to determine Scala version. spark_version : str, optional - Spark version in format ``major.minor``. + Spark version in format ``major.minor``. Used only if ``scala_version=None``. - Used only if ``scala_version=None``. + package_version : str, optional + Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.3.0``. + + .. versionadded:: 0.11.0 Examples -------- - .. 
code:: python from onetl.connection import MongoDB MongoDB.get_packages(scala_version="2.12") - MongoDB.get_packages(spark_version="3.4") + # specify custom connector version + MongoDB.get_packages(scala_version="2.12", package_version="10.3.0") """ - # Connector version is fixed, so we can perform checks for Scala/Spark version + default_package_version = "10.3.0" + if scala_version: - scala_ver = Version.parse(scala_version) + scala_ver = Version(scala_version).min_digits(2) elif spark_version: - spark_ver = Version.parse(spark_version) + spark_ver = Version(spark_version) if spark_ver.major < 3: raise ValueError(f"Spark version must be at least 3.0, got {spark_ver}") scala_ver = get_default_scala_version(spark_ver) else: raise ValueError("You should pass either `scala_version` or `spark_version`") - if scala_ver.digits(2) < (2, 12) or scala_ver.digits(2) > (2, 13): - raise ValueError(f"Scala version must be 2.12 - 2.13, got {scala_ver}") + connector_ver = Version(package_version or default_package_version).min_digits(2) + + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version must be at least 2.12, got {scala_ver}") - # https://mvnrepository.com/artifact/org.mongodb.spark/mongo-spark-connector - return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.digits(2)}:10.1.1"] + return [f"org.mongodb.spark:mongo-spark-connector_{scala_ver.format('{0}.{1}')}:{connector_ver}"] @classproperty def package_spark_3_2(cls) -> str: @@ -190,7 +196,7 @@ def package_spark_3_2(cls) -> str: "use `MongoDB.get_packages(spark_version='3.2')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @classproperty def package_spark_3_3(cls) -> str: @@ -200,7 +206,7 @@ def package_spark_3_3(cls) -> str: "use `MongoDB.get_packages(spark_version='3.3')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return 
"org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @classproperty def package_spark_3_4(cls) -> str: @@ -210,7 +216,7 @@ def package_spark_3_4(cls) -> str: "use `MongoDB.get_packages(spark_version='3.4')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" @slot def pipeline( @@ -237,6 +243,8 @@ def pipeline( This method does not support :ref:`strategy`, use :obj:`DBReader ` instead + .. versionadded:: 0.7.0 + Parameters ---------- @@ -251,7 +259,7 @@ def pipeline( Schema describing the resulting DataFrame. options : PipelineOptions | dict, optional - Additional pipeline options, see :obj:`~PipelineOptions`. + Additional pipeline options, see :obj:`MongoDB.PipelineOptions `. Examples -------- @@ -512,7 +520,7 @@ def _check_java_class_imported(cls, spark): try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(2) + spark_version = get_spark_version(spark).format("{major}.{minor}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 2ddc7a068..223e05ec4 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -94,7 +94,7 @@ def _missing_(cls, value: object): # noqa: WPS120 class MongoDBPipelineOptions(GenericOptions): """Aggregation pipeline options for MongoDB connector. - The only difference from :obj:`MongoDBReadOptions` that it is allowed to pass the ``hint`` parameter. + The only difference from :obj:`MongoDB.ReadOptions ` is that it is allowed to pass the ``hint`` parameter. ..
note :: @@ -109,6 +109,8 @@ class MongoDBPipelineOptions(GenericOptions): Options ``uri``, ``database``, ``collection``, ``pipeline`` are populated from connection attributes, and cannot be overridden by the user in ``PipelineOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -143,6 +145,8 @@ class MongoDBReadOptions(GenericOptions): Options ``uri``, ``database``, ``collection``, ``pipeline``, ``hint`` are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -151,7 +155,7 @@ class MongoDBReadOptions(GenericOptions): .. code:: python MongoDB.ReadOptions( - batchSize=10000, + sampleSize=100, ) """ @@ -177,6 +181,8 @@ class MongoDBWriteOptions(GenericOptions): Options ``uri``, ``database``, ``collection`` are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. + .. versionadded:: 0.7.0 + Examples -------- @@ -246,6 +252,8 @@ class MongoDBWriteOptions(GenericOptions): * Collection exists An error is raised, and no data is written to the collection. + .. 
versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ class Config: diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 608ef7521..2ef1f1f69 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -9,6 +9,13 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.mssql.dialect import MSSQLDialect +from onetl.connection.db_connection.mssql.options import ( + MSSQLExecuteOptions, + MSSQLFetchOptions, + MSSQLReadOptions, + MSSQLSQLOptions, + MSSQLWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl import GenericOptions @@ -25,11 +32,11 @@ class Config: class MSSQL(JDBCConnection): """MSSQL JDBC connection. |support_hooks| - Based on Maven package ``com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8`` + Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8 `_ (`official MSSQL JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`mssql-prerequisites` @@ -139,6 +146,12 @@ class MSSQL(JDBCConnection): port: int = 1433 extra: MSSQLExtra = MSSQLExtra() + ReadOptions = MSSQLReadOptions + WriteOptions = MSSQLWriteOptions + SQLOptions = MSSQLSQLOptions + FetchOptions = MSSQLFetchOptions + ExecuteOptions = MSSQLExecuteOptions + Extra = MSSQLExtra Dialect = MSSQLDialect @@ -149,51 +162,67 @@ class MSSQL(JDBCConnection): @classmethod def get_packages( cls, - java_version: str | Version | None = None, + java_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for MSSQL. |support_hooks| + + .. 
versionadded:: 0.9.0 Parameters ---------- - java_version : str, default ``8`` - Java major version. + java_version : str, optional + Java major version, defaults to ``8``. Must be ``8`` or ``11``. + package_version : str, optional + Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.6.2``. Examples -------- - .. code:: python from onetl.connection import MSSQL MSSQL.get_packages() - MSSQL.get_packages(java_version="8") + # specify Java and package versions + MSSQL.get_packages(java_version="8", package_version="12.6.2.jre11") """ - if java_version is None: - java_version = "8" + default_java_version = "8" + default_package_version = "12.6.2" - java_ver = Version.parse(java_version) + java_ver = Version(java_version or default_java_version) if java_ver.major < 8: raise ValueError(f"Java version must be at least 8, got {java_ver}") jre_ver = "8" if java_ver.major < 11 else "11" - return [f"com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre{jre_ver}"] + full_package_version = Version(package_version or default_package_version).min_digits(3) + + # check if a JRE suffix is already included + if ".jre" in str(full_package_version): + jdbc_version = full_package_version + else: + jdbc_version = Version(f"{full_package_version}.jre{jre_ver}") + + return [f"com.microsoft.sqlserver:mssql-jdbc:{jdbc_version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MSSQL.package` will be removed in 1.0.0, use `MSSQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8" + return "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" @property def jdbc_url(self) -> str: - prop = self.extra.dict(by_alias=True) - prop["databaseName"] = self.database - parameters = ";".join(f"{k}={v}" for k, v in sorted(prop.items())) + return f"jdbc:sqlserver://{self.host}:{self.port}" - return f"jdbc:sqlserver://{self.host}:{self.port};{parameters}" + @property + def
jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + result["databaseName"] = self.database + return result @property def instance_url(self) -> str: diff --git a/onetl/connection/db_connection/mssql/options.py b/onetl/connection/db_connection/mssql/options.py new file mode 100644 index 000000000..c14e38b68 --- /dev/null +++ b/onetl/connection/db_connection/mssql/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class MSSQLReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class MSSQLWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class MSSQLSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class MSSQLFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class MSSQLExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index ebf43f668..4e37d1d52 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -6,8 +6,16 @@ from typing import ClassVar, Optional from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.mysql.dialect import MySQLDialect +from onetl.connection.db_connection.mysql.options import ( + MySQLExecuteOptions, + 
MySQLFetchOptions, + MySQLReadOptions, + MySQLSQLOptions, + MySQLWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl.generic_options import GenericOptions @@ -26,13 +34,15 @@ class Config: class MySQL(JDBCConnection): """MySQL JDBC connection. |support_hooks| - Based on Maven package ``com.mysql:mysql-connector-j:8.0.33`` - (`official MySQL JDBC driver `_). + Based on Maven package `com.mysql:mysql-connector-j:8.4.0 `_ + (`official MySQL JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`mysql-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -97,6 +107,12 @@ class MySQL(JDBCConnection): database: Optional[str] = None extra: MySQLExtra = MySQLExtra() + ReadOptions = MySQLReadOptions + WriteOptions = MySQLWriteOptions + SQLOptions = MySQLSQLOptions + FetchOptions = MySQLFetchOptions + ExecuteOptions = MySQLExecuteOptions + Extra = MySQLExtra Dialect = MySQLDialect @@ -104,35 +120,51 @@ class MySQL(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages(cls, package_version: str | None = None) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version for MySQL. |support_hooks| + + .. versionadded:: 0.9.0 + + Parameters + ---------- + package_version : str, optional + Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.4.0``. + + .. versionadded:: 0.11.0 Examples -------- - .. 
code:: python from onetl.connection import MySQL MySQL.get_packages() + # specify a custom package version + MySQL.get_packages(package_version="8.2.0") """ - return ["com.mysql:mysql-connector-j:8.0.33"] + default_version = "8.4.0" + version = Version(package_version or default_version).min_digits(3) + + return [f"com.mysql:mysql-connector-j:{version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MySQL.package` will be removed in 1.0.0, use `MySQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.mysql:mysql-connector-j:8.0.33" + return "com.mysql:mysql-connector-j:8.4.0" @property - def jdbc_url(self): - prop = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(prop.items())) - + def jdbc_url(self) -> str: if self.database: - return f"jdbc:mysql://{self.host}:{self.port}/{self.database}?{parameters}" + return f"jdbc:mysql://{self.host}:{self.port}/{self.database}" - return f"jdbc:mysql://{self.host}:{self.port}?{parameters}" + return f"jdbc:mysql://{self.host}:{self.port}" + + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + return result diff --git a/onetl/connection/db_connection/mysql/options.py b/onetl/connection/db_connection/mysql/options.py new file mode 100644 index 000000000..06abd6d2d --- /dev/null +++ b/onetl/connection/db_connection/mysql/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class MySQLReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class 
MySQLWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class MySQLSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class MySQLFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class MySQLExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index f6b1ad2d9..fd0f9c1f7 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -22,8 +22,19 @@ from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, + JDBCOptions, +) from onetl.connection.db_connection.oracle.dialect import OracleDialect +from onetl.connection.db_connection.oracle.options import ( + OracleExecuteOptions, + OracleFetchOptions, + OracleReadOptions, + OracleSQLOptions, + OracleWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.hwm import Window from onetl.impl import GenericOptions @@ -70,13 +81,15 @@ class Config: class Oracle(JDBCConnection): """Oracle JDBC connection. |support_hooks| - Based on Maven package ``com.oracle.database.jdbc:ojdbc8:23.2.0.0`` + Based on Maven package `com.oracle.database.jdbc:ojdbc8:23.4.0.24.05 `_ (`official Oracle JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`oracle-prerequisites` + .. 
versionadded:: 0.1.0 + Parameters ---------- host : str @@ -169,6 +182,12 @@ class Oracle(JDBCConnection): service_name: Optional[str] = None extra: OracleExtra = OracleExtra() + ReadOptions = OracleReadOptions + WriteOptions = OracleWriteOptions + SQLOptions = OracleSQLOptions + FetchOptions = OracleFetchOptions + ExecuteOptions = OracleExecuteOptions + Extra = OracleExtra Dialect = OracleDialect @@ -179,15 +198,18 @@ class Oracle(JDBCConnection): @classmethod def get_packages( cls, - java_version: str | Version | None = None, + java_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for Oracle. |support_hooks| Parameters ---------- - java_version : str, default ``8`` - Java major version. + java_version : str, optional + Java major version, defaults to "8". Must be "8" or "11". + package_version : str, optional + Specifies the version of the Oracle JDBC driver to use. Defaults to "23.4.0.24.05". 
Examples -------- @@ -197,35 +219,42 @@ def get_packages( from onetl.connection import Oracle Oracle.get_packages() - Oracle.get_packages(java_version="8") + # specify Java and package versions + Oracle.get_packages(java_version="8", package_version="23.4.0.24.05") """ - if java_version is None: - java_version = "8" - java_ver = Version.parse(java_version) + default_java_version = "8" + default_package_version = "23.4.0.24.05" + + java_ver = Version(java_version or default_java_version) if java_ver.major < 8: - raise ValueError(f"Java version must be at least 8, got {java_ver}") + raise ValueError(f"Java version must be at least 8, got {java_ver.major}") jre_ver = "8" if java_ver.major < 11 else "11" - return [f"com.oracle.database.jdbc:ojdbc{jre_ver}:23.2.0.0"] + jdbc_version = Version(package_version or default_package_version).min_digits(4) + + return [f"com.oracle.database.jdbc:ojdbc{jre_ver}:{jdbc_version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Oracle.package` will be removed in 1.0.0, use `Oracle.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.oracle.database.jdbc:ojdbc8:23.2.0.0" + return "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - if self.sid: - return f"jdbc:oracle:thin:@{self.host}:{self.port}:{self.sid}?{parameters}".rstrip("?") + return f"jdbc:oracle:thin:@{self.host}:{self.port}:{self.sid}" + + return f"jdbc:oracle:thin:@//{self.host}:{self.port}/{self.service_name}" - return f"jdbc:oracle:thin:@//{self.host}:{self.port}/{self.service_name}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + return result @property def instance_url(self) -> str: @@ -262,14 +291,14 @@ def get_min_max_values( def execute( 
self, statement: str, - options: JDBCOptions | dict | None = None, # noqa: WPS437 + options: JDBCOptions | JDBCExecuteOptions | dict | None = None, # noqa: WPS437 ) -> DataFrame | None: statement = clear_statement(statement) log.info("|%s| Executing statement (on driver):", self.__class__.__name__) log_lines(log, statement) - call_options = self.JDBCOptions.parse(options) + call_options = self.ExecuteOptions.parse(options) df = self._call_on_driver(statement, call_options) self._handle_compile_errors(statement.strip(), call_options) @@ -326,7 +355,7 @@ def _get_compile_errors( type_name: str, schema: str, object_name: str, - options: JDBCOptions, + options: JDBCExecuteOptions | JDBCFetchOptions, ) -> list[tuple[ErrorPosition, str]]: """ Get compile errors for the object. @@ -396,7 +425,7 @@ def _build_error_message(self, aggregated_errors: OrderedDict[ErrorPosition, str def _handle_compile_errors( self, statement: str, - options: JDBCOptions, + options: JDBCExecuteOptions, ) -> None: """ Oracle does not return compilation errors immediately. 
diff --git a/onetl/connection/db_connection/oracle/options.py b/onetl/connection/db_connection/oracle/options.py new file mode 100644 index 000000000..61b82e1b1 --- /dev/null +++ b/onetl/connection/db_connection/oracle/options.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class OracleReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class OracleWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class OracleSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class OracleFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class OracleExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index a4d27bcea..44a36eb6e 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -6,9 +6,21 @@ from typing import ClassVar from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection -from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, + JDBCOptions, +) from onetl.connection.db_connection.postgres.dialect import PostgresDialect +from onetl.connection.db_connection.postgres.options import ( + 
PostgresExecuteOptions, + PostgresFetchOptions, + PostgresReadOptions, + PostgresSQLOptions, + PostgresWriteOptions, +) from onetl.hooks import slot, support_hooks from onetl.impl import GenericOptions @@ -19,6 +31,10 @@ class PostgresExtra(GenericOptions): # allows automatic conversion from text to target column type during write stringtype: str = "unspecified" + # avoid closing connections from server side + # while connector is moving data to executors before insert + tcpKeepAlive: str = "true" # noqa: N815 + class Config: extra = "allow" @@ -27,13 +43,15 @@ class Config: class Postgres(JDBCConnection): """PostgreSQL JDBC connection. |support_hooks| - Based on Maven package ``org.postgresql:postgresql:42.6.0`` + Based on Maven package `org.postgresql:postgresql:42.7.3 `_ (`official Postgres JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`postgres-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -98,6 +116,12 @@ class Postgres(JDBCConnection): port: int = 5432 extra: PostgresExtra = PostgresExtra() + ReadOptions = PostgresReadOptions + WriteOptions = PostgresWriteOptions + SQLOptions = PostgresSQLOptions + FetchOptions = PostgresFetchOptions + ExecuteOptions = PostgresExecuteOptions + Extra = PostgresExtra Dialect = PostgresDialect @@ -105,9 +129,16 @@ class Postgres(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages(cls, package_version: str | None = None) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying a custom JDBC driver version. |support_hooks| + + .. versionadded:: 0.9.0 + + Parameters + ---------- + package_version : str, optional + Specifies the version of the PostgreSQL JDBC driver to use. Defaults to ``42.7.3``. 
Examples -------- @@ -118,29 +149,41 @@ def get_packages(cls) -> list[str]: Postgres.get_packages() + # custom package version + Postgres.get_packages(package_version="42.6.0") + """ - return ["org.postgresql:postgresql:42.6.0"] + default_version = "42.7.3" + version = Version(package_version or default_version).min_digits(3) + + return [f"org.postgresql:postgresql:{version}"] @classproperty def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Postgres.package` will be removed in 1.0.0, use `Postgres.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "org.postgresql:postgresql:42.6.0" + return "org.postgresql:postgresql:42.7.3" @property def jdbc_url(self) -> str: - extra = self.extra.dict(by_alias=True) - extra["ApplicationName"] = extra.get("ApplicationName", self.spark.sparkContext.appName) + return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" - parameters = "&".join(f"{k}={v}" for k, v in sorted(extra.items())) - return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}?{parameters}".rstrip("?") + @property + def jdbc_params(self) -> dict[str, str]: + result = super().jdbc_params + result.update(self.extra.dict(by_alias=True)) + result["ApplicationName"] = result.get("ApplicationName", self.spark.sparkContext.appName) + return result @property def instance_url(self) -> str: return f"{super().instance_url}/{self.database}" - def _options_to_connection_properties(self, options: JDBCOptions): # noqa: WPS437 + def _options_to_connection_properties( + self, + options: JDBCOptions | JDBCFetchOptions | JDBCExecuteOptions, + ): # noqa: WPS437 # See https://github.com/pgjdbc/pgjdbc/pull/1252 # Since 42.2.9 Postgres JDBC Driver added new option readOnlyMode=transaction # Which is not a desired behavior, because `.fetch()` method should always be read-only diff --git a/onetl/connection/db_connection/postgres/options.py b/onetl/connection/db_connection/postgres/options.py new file mode 
100644 index 000000000..3a4dd806f --- /dev/null +++ b/onetl/connection/db_connection/postgres/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + JDBCFetchOptions, +) + + +class PostgresReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class PostgresWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class PostgresSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class PostgresFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class PostgresExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 0dc3e67dd..3ca789a13 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -5,9 +5,18 @@ import warnings from typing import ClassVar, Optional +from onetl._internal import stringify from onetl._util.classproperty import classproperty +from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.teradata.dialect import TeradataDialect +from onetl.connection.db_connection.teradata.options import ( + TeradataExecuteOptions, + TeradataFetchOptions, + TeradataReadOptions, + TeradataSQLOptions, + TeradataWriteOptions, +) from onetl.hooks import slot from onetl.impl import GenericOptions @@ -29,13 +38,15 @@ class Config: class Teradata(JDBCConnection): """Teradata 
JDBC connection. |support_hooks| - Based on package ``com.teradata.jdbc:terajdbc:17.20.00.15`` + Based on package `com.teradata.jdbc:terajdbc:17.20.00.15 `_ (`official Teradata JDBC driver `_). - .. warning:: + .. seealso:: Before using this connector please take into account :ref:`teradata-prerequisites` + .. versionadded:: 0.1.0 + Parameters ---------- host : str @@ -116,6 +127,12 @@ class Teradata(JDBCConnection): database: Optional[str] = None extra: TeradataExtra = TeradataExtra() + ReadOptions = TeradataReadOptions + WriteOptions = TeradataWriteOptions + SQLOptions = TeradataSQLOptions + FetchOptions = TeradataFetchOptions + ExecuteOptions = TeradataExecuteOptions + Extra = TeradataExtra Dialect = TeradataDialect @@ -124,21 +141,37 @@ class Teradata(JDBCConnection): @slot @classmethod - def get_packages(cls) -> list[str]: + def get_packages( + cls, + package_version: str | None = None, + ) -> list[str]: """ - Get package names to be downloaded by Spark. |support_hooks| + Get package names to be downloaded by Spark. Allows specifying custom JDBC driver versions for Teradata. |support_hooks| + + .. versionadded:: 0.9.0 + + Parameters + ---------- + package_version : str, optional + Specifies the version of the Teradata JDBC driver to use. Defaults to ``17.20.00.15``. + + .. versionadded:: 0.11.0 Examples -------- - .. 
code:: python from onetl.connection import Teradata Teradata.get_packages() + # specify custom driver version + Teradata.get_packages(package_version="20.00.00.18") """ - return ["com.teradata.jdbc:terajdbc:17.20.00.15"] + default_package_version = "17.20.00.15" + version = Version(package_version or default_package_version).min_digits(4) + + return [f"com.teradata.jdbc:terajdbc:{version}"] @classproperty def package(cls) -> str: @@ -149,12 +182,22 @@ def package(cls) -> str: @property def jdbc_url(self) -> str: - prop = self.extra.dict(by_alias=True) + # Teradata JDBC driver documentation specifically mentions that params from + # java.sql.DriverManager.getConnection(url, params) are used to only retrieve 'user' and 'password' values. + # Other params should be passed via url + properties = self.extra.dict(by_alias=True) if self.database: - prop["DATABASE"] = self.database + properties["DATABASE"] = self.database + + properties["DBS_PORT"] = self.port - prop["DBS_PORT"] = self.port + connection_params = [] + for key, value in sorted(properties.items()): + string_value = stringify(value) + if "," in string_value: + connection_params.append(f"{key}='{string_value}'") + else: + connection_params.append(f"{key}={string_value}") - conn = ",".join(f"{k}={v}" for k, v in sorted(prop.items())) - return f"jdbc:teradata://{self.host}/{conn}" + return f"jdbc:teradata://{self.host}/{','.join(connection_params)}" diff --git a/onetl/connection/db_connection/teradata/options.py b/onetl/connection/db_connection/teradata/options.py new file mode 100644 index 000000000..eb77f8c87 --- /dev/null +++ b/onetl/connection/db_connection/teradata/options.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 + +from onetl.connection.db_connection.jdbc_connection.options import ( + JDBCReadOptions, + JDBCSQLOptions, + JDBCWriteOptions, +) +from onetl.connection.db_connection.jdbc_mixin.options import ( + JDBCExecuteOptions, + 
JDBCFetchOptions, +) + + +class TeradataReadOptions(JDBCReadOptions): + __doc__ = JDBCReadOptions.__doc__ # type: ignore[assignment] + + +class TeradataWriteOptions(JDBCWriteOptions): + __doc__ = JDBCWriteOptions.__doc__ # type: ignore[assignment] + + +class TeradataSQLOptions(JDBCSQLOptions): + __doc__ = JDBCSQLOptions.__doc__ # type: ignore[assignment] + + +class TeradataFetchOptions(JDBCFetchOptions): + __doc__ = JDBCFetchOptions.__doc__ # type: ignore[assignment] + + +class TeradataExecuteOptions(JDBCExecuteOptions): + __doc__ = JDBCExecuteOptions.__doc__ # type: ignore[assignment] diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 3d7437ad8..0a1584095 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -87,7 +87,6 @@ def close(self): .. code:: python content = connection.list_dir("/mydir") - assert content connection.close() # or @@ -603,19 +602,19 @@ def _extract_name_from_entry(self, entry) -> str: Get an entry name: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - assert entry == { - "created": "2023-12-08T18:33:39Z", - "owner": None, - "size": "23", - "modified": "2023-12-08 18:33:20", - "isdir": False, - "path": "/path/to/the/file.txt", - } - - assert entry._extract_name_from_entry(entry) == "file.txt" + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> entry._extract_name_from_entry(entry) + 'file.txt' """ @abstractmethod @@ -643,10 +642,19 @@ def _is_dir_entry(self, top: RemotePath, entry) -> bool: Show if the entry is a directory: - .. 
code:: python - - for component in connection._scan_entries(path="/a/path/to/the/directory"): - assert connection._is_dir_entry(root="/a/path/to/the/directory", entry) == True + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 0, + 'modified': '2023-12-08 18:33:20', + 'isdir': True, + 'path': '/path/to/the/directory', + } + >>> connection._is_dir_entry(root="/a/path/to/the/directory", entry) + True """ @abstractmethod @@ -674,10 +682,19 @@ def _is_file_entry(self, top: RemotePath, entry) -> bool: Show if the entry is a file: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - assert connection._is_file_entry(root="/a/path/to/the/directory", entry) == True + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> connection._is_file_entry(root="/a/path/to/the/directory", entry) + True """ @abstractmethod @@ -708,15 +725,25 @@ def _extract_stat_from_entry(self, top: RemotePath, entry) -> PathStatProtocol: Get statistics object from the entry: - .. code:: python - - for entry in connection._scan_entries(path="/a/path/to/the/directory"): - stat = connection._extract_stat_from_entry(root="/a/path/to/the/directory", entry) - - assert stat == RemotePathStat( - st_size=23, st_mtime=1670517693.0, st_mode=None, st_uid=None, st_gid=None - ) - + >>> for entry in connection._scan_entries(path="/a/path/to/the/directory"): + ... 
break + >>> entry + { + 'created': '2023-12-08T18:33:39Z', + 'owner': None, + 'size': 23, + 'modified': '2023-12-08 18:33:20', + 'isdir': False, + 'path': '/path/to/the/directory/file.txt', + } + >>> connection._extract_stat_from_entry(root="/a/path/to/the/directory", entry) + RemotePathStat( + st_size=23, + st_mtime=1670517693.0, + st_mode=None, + st_uid=None, + st_gid=None, + ) """ def _log_parameters(self): diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 07b8b53e1..b457b966f 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -62,6 +62,8 @@ class FTP(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 9277b66fd..8cf9aa8fc 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -59,6 +59,8 @@ class FTPS(FTP): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 7105763fe..056622fbe 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -88,6 +88,8 @@ class HDFS(FileConnection, RenameDirMixin): You should pass at least one of these arguments: ``cluster``, ``host``. + .. versionadded:: 0.7.0 + host : str, optional Hadoop namenode host. For example: ``namenode1.domain.com``. @@ -223,6 +225,8 @@ def get_current(cls, **kwargs): Can be used only if there are a some hooks bound to slot :obj:`Slots.get_current_cluster ` + .. 
versionadded:: 0.7.0 + Parameters ---------- user : str diff --git a/onetl/connection/file_connection/hdfs/slots.py b/onetl/connection/file_connection/hdfs/slots.py index 8bd8431a1..2f75fefad 100644 --- a/onetl/connection/file_connection/hdfs/slots.py +++ b/onetl/connection/file_connection/hdfs/slots.py @@ -7,7 +7,10 @@ @support_hooks class HDFSSlots: - """Slots that could be implemented by third-party plugins""" + """Slots that could be implemented by third-party plugins. + + .. versionadded:: 0.7.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -52,6 +57,8 @@ def normalize_namenode_host(host: str, cluster: str | None) -> str | None: If hooks didn't return anything, host is left intact. + .. versionadded:: 0.7.0 + Parameters ---------- host : :obj:`str` @@ -97,6 +104,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into HDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Returns ------- set[str] | None @@ -128,6 +137,8 @@ def get_cluster_namenodes(cluster: str) -> set[str] | None: Namenode host passed into HDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -166,6 +177,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~get_current_cluster` to automatically fill up ``cluster`` attribute of a connection. If hooks didn't return anything, calling the method above will raise an exception. + .. versionadded:: 0.7.0 + Returns ------- str | None @@ -197,6 +210,8 @@ def get_webhdfs_port(cluster: str) -> int | None: Used by constructor to automatically set port number if omitted. + .. 
versionadded:: 0.7.0 + Parameters ---------- cluster : :obj:`str` @@ -242,6 +257,8 @@ def is_namenode_active(host: str, cluster: str | None) -> bool | None: :obj:`~check` will determine whether this host is active. + .. versionadded:: 0.7.0 + Parameters ---------- host : :obj:`str` diff --git a/onetl/connection/file_connection/mixins/rename_dir_mixin.py b/onetl/connection/file_connection/mixins/rename_dir_mixin.py index c183ebf56..c110745c1 100644 --- a/onetl/connection/file_connection/mixins/rename_dir_mixin.py +++ b/onetl/connection/file_connection/mixins/rename_dir_mixin.py @@ -23,6 +23,8 @@ def rename_dir( """ Rename or move dir on remote filesystem. + .. versionadded:: 0.8.0 + Parameters ---------- source_dir_path : str or :obj:`os.PathLike` @@ -52,11 +54,13 @@ def rename_dir( Examples -------- - .. code:: python - - new_file = connection.rename_dir("/path/to/dir1", "/path/to/dir2") - assert connection.path_exists("/path/to/dir1") - assert not connection.path_exists("/path/to/dir2") + >>> new_dir = connection.rename_dir("/path/to/dir1", "/path/to/dir2") + >>> os.fspath(new_dir) + '/path/to/dir2' + >>> connection.path_exists("/path/to/dir1") + False + >>> connection.path_exists("/path/to/dir2") + True """ log.debug("|%s| Renaming directory '%s' to '%s'", self.__class__.__name__, source_dir_path, target_dir_path) diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 7811c4c4b..f8f584dcc 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -62,6 +62,8 @@ class S3(FileConnection): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.5.1 + Parameters ---------- host : str @@ -82,6 +84,9 @@ class S3(FileConnection): protocol : str, default : ``https`` Connection protocol. Allowed values: ``https`` or ``http`` + .. 
versionchanged:: 0.6.0 + Renamed ``secure: bool`` to ``protocol: Literal["https", "http"]`` + session_token : str, optional Session token of your account in S3 service diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 3d64669b1..8cd2ac1ed 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -65,6 +65,8 @@ class SFTP(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.1.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index f74eed3e3..aa540567e 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -65,6 +65,8 @@ class WebDAV(FileConnection, RenameDirMixin): See :ref:`install-files` installation instruction for more details. + .. versionadded:: 0.6.0 + Parameters ---------- host : str diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 1c0ca6bb3..26c1416eb 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -37,31 +37,9 @@ class SparkHDFS(SparkFileDFConnection): Based on `Spark Generic File Data Source `_. - .. warning:: + .. seealso:: - To use Hive connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. - - .. note:: - - Most of Hadoop instances use Kerberos authentication. 
In this case, you should call ``kinit`` - **BEFORE** starting Spark session to generate Kerberos ticket. See :ref:`install-kerberos`. - - In case of creating session with ``"spark.master": "yarn"``, you should also pass some additional options - to Spark session, allowing executors to generate their own Kerberos tickets to access HDFS. - See `Spark security documentation `_ - for more details. + Before using this connector please take into account :ref:`spark-hdfs-prerequisites` .. note:: @@ -70,6 +48,8 @@ class SparkHDFS(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. For these operations, use :obj:`HDFS ` connection. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : str @@ -105,7 +85,7 @@ class SparkHDFS(SparkFileDFConnection): Examples -------- - SparkHDFS connection initialization + SparkHDFS connection initialization: .. code:: python @@ -122,11 +102,15 @@ class SparkHDFS(SparkFileDFConnection): spark=spark, ).check() - SparkHDFS connection initialization with Kerberos support + SparkHDFS connection initialization with Kerberos support: + + .. code:: bash + + $ kinit -kt /path/to/keytab user .. code:: python - from onetl.connection import Hive + from onetl.connection import SparkHDFS from pyspark.sql import SparkSession # Create Spark session. @@ -240,6 +224,8 @@ def get_current(cls, spark: SparkSession): Can be used only if there are a some hooks bound to :obj:`Slots.get_current_cluster `. + .. 
versionadded:: 0.9.0 + Parameters ---------- spark : SparkSession diff --git a/onetl/connection/file_df_connection/spark_hdfs/slots.py b/onetl/connection/file_df_connection/spark_hdfs/slots.py index d16c6527e..4dab6b548 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/slots.py +++ b/onetl/connection/file_df_connection/spark_hdfs/slots.py @@ -7,7 +7,10 @@ @support_hooks class SparkHDFSSlots: - """Spark HDFS slots that could be implemented by third-party plugins""" + """Spark HDFS slots that could be implemented by third-party plugins. + + .. versionadded:: 0.9.0 + """ @slot @staticmethod @@ -17,6 +20,8 @@ def normalize_cluster_name(cluster: str) -> str | None: If hooks didn't return anything, cluster name is left intact. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -52,6 +57,8 @@ def normalize_namenode_host(host: str, cluster: str) -> str | None: If hooks didn't return anything, host is left intact. + .. versionadded:: 0.9.0 + Parameters ---------- host : :obj:`str` @@ -97,6 +104,8 @@ def get_known_clusters() -> set[str] | None: Cluster passed into SparkHDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.9.0 + Returns ------- set[str] | None @@ -128,6 +137,8 @@ def get_cluster_namenodes(cluster: str) -> set[str] | None: Namenode host passed into SparkHDFS constructor should be present in this list. If hooks didn't return anything, no validation will be performed. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -166,6 +177,8 @@ def get_current_cluster() -> str | None: Used in :obj:`~get_current_cluster` to automatically fill up ``cluster`` attribute of a connection. If hooks didn't return anything, calling the method above will raise an exception. + .. 
versionadded:: 0.9.0 + Returns ------- str | None @@ -197,6 +210,8 @@ def get_ipc_port(cluster: str) -> int | None: Used by constructor to automatically set port number if omitted. + .. versionadded:: 0.9.0 + Parameters ---------- cluster : :obj:`str` @@ -242,6 +257,8 @@ def is_namenode_active(host: str, cluster: str) -> bool | None: :obj:`~check` will determine whether this host is active. + .. versionadded:: 0.9.0 + Parameters ---------- host : :obj:`str` diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index 0ebd43683..839cbdaec 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -28,18 +28,9 @@ class SparkLocalFS(SparkFileDFConnection): .. warning:: - To use SparkLocalFS connector you should have PySpark installed (or injected to ``sys.path``) + To use SparkLocalFS connector you should have PySpark installed (or injected to ``sys.path``) BEFORE creating the connector instance. - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - See :ref:`install-spark` installation instruction for more details. .. warning:: @@ -52,6 +43,8 @@ class SparkLocalFS(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. + .. versionadded:: 0.9.0 + Parameters ---------- spark : :class:`pyspark.sql.SparkSession` diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 46ce8bb33..04da89e0e 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -53,31 +53,9 @@ class SparkS3(SparkFileDFConnection): Based on `Hadoop-AWS module `_ and `Spark integration with Cloud Infrastructures `_. - .. 
dropdown:: Version compatibility + .. seealso:: - * Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) - * Scala versions: 2.12 - 2.13 - * Java versions: 8 - 20 - - .. warning:: - - See :ref:`spark-s3-troubleshooting` guide. - - .. warning:: - - To use SparkS3 connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`spark-s3-prerequisites` .. note:: @@ -86,6 +64,8 @@ class SparkS3(SparkFileDFConnection): Does NOT support file operations, like create, delete, rename, etc. For these operations, use :obj:`S3 ` connection. + .. versionadded:: 0.9.0 + Parameters ---------- host : str @@ -232,12 +212,14 @@ class SparkS3(SparkFileDFConnection): @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| + .. 
versionadded:: 0.9.0 + Parameters ---------- spark_version : str @@ -260,14 +242,14 @@ def get_packages( """ - spark_ver = Version.parse(spark_version) + spark_ver = Version(spark_version).min_digits(3) if spark_ver.major < 3: # https://issues.apache.org/jira/browse/SPARK-23977 raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud - return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] + return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] @slot def path_from_string(self, path: os.PathLike | str) -> RemotePath: @@ -365,7 +347,7 @@ def _check_java_class_imported(cls, spark: SparkSession) -> SparkSession: try: try_import_java_class(spark, java_class) except Exception as e: - spark_version = get_spark_version(spark).digits(3) + spark_version = get_spark_version(spark).format("{major}.{minor}.{patch}") msg = MISSING_JVM_CLASS_MSG.format( java_class=java_class, package_source=cls.__name__, diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index af038c24f..91b3f21b1 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -58,20 +58,13 @@ class DBReader(FrozenModel): .. note:: - This class operates with only one table at a time. It does NOT support executing JOINs. + This class operates with only one source at a time. It does NOT support executing queries + against multiple sources, like ``SELECT ... JOIN``. - To get the JOIN result you can instead: + .. versionadded:: 0.1.0 - 1. 
Use 2 instandes of DBReader with different tables, - call :obj:`~run` of each one to get a table dataframe, - and then use ``df1.join(df2)`` syntax (Hive) - - 2. Use ``connection.execute("INSERT INTO ... SELECT ... JOIN ...")`` - to execute JOIN on RDBMS side, write the result into a temporary table, - and then use DBReader to get the data from this temporary table (MPP systems, like Greenplum) - - 3. Use ``connection.sql(query)`` method to pass SQL query with a JOIN, - and fetch the result (other RDBMS) + .. versionchanged:: 0.8.0 + Moved ``onetl.core.DBReader`` → ``onetl.db.DBReader`` Parameters ---------- @@ -84,6 +77,9 @@ class DBReader(FrozenModel): If connection has schema support, you need to specify the full name of the source including the schema, e.g. ``schema.name``. + .. versionchanged:: 0.7.0 + Renamed ``table`` → ``source`` + columns : list of str, default: None The list of columns to be read. @@ -160,6 +156,9 @@ class DBReader(FrozenModel): Some sources does not support passing expressions and can be used only with column/field names which present in the source. + .. versionchanged:: 0.10.0 + Replaces deprecated ``hwm_column`` and ``hwm_expression`` attributes + hint : Any, default: ``None`` Hint expression used for querying the data. @@ -518,12 +517,13 @@ def has_data(self) -> bool: .. warning:: - If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + + .. versionadded:: 0.10.0 Raises ------ RuntimeError - Current strategy is not compatible with HWM parameter. Examples @@ -575,14 +575,14 @@ def raise_if_no_data(self) -> None: If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. 
And vise-versa, if HWM is not used, this method should not be called within strategy. + .. versionadded:: 0.10.0 + Raises ------ - RuntimeError - + RuntimeError Current strategy is not compatible with HWM parameter. :obj:`onetl.exception.NoDataError` - There is no data in source. Examples @@ -612,17 +612,13 @@ def run(self) -> DataFrame: If :etl-entities:`hwm ` is used, then method should be called inside :ref:`strategy` context. And vise-versa, if HWM is not used, this method should not be called within strategy. + .. versionadded:: 0.1.0 + Returns ------- df : pyspark.sql.dataframe.DataFrame Spark dataframe - .. note:: - - Keep in mind that with differences in the timezone settings of the source and Spark, - there may be discrepancies in the datetime on the source and in the Spark dataframe. - It depends on the ``spark.sql.session.timeZone`` option set when creating the Spark session. - Examples -------- diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index eb83c9eb8..666fce87e 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -30,6 +30,11 @@ class DBWriter(FrozenModel): """Class specifies schema and table where you can write your dataframe. |support_hooks| + .. versionadded:: 0.1.0 + + .. versionchanged:: 0.8.0 + Moved ``onetl.core.DBWriter`` → ``onetl.db.DBWriter`` + Parameters ---------- connection : :obj:`onetl.connection.DBConnection` @@ -41,8 +46,11 @@ class DBWriter(FrozenModel): If connection has schema support, you need to specify the full name of the source including the schema, e.g. ``schema.name``. + .. versionchanged:: 0.7.0 + Renamed ``table`` → ``target`` + options : dict, :obj:`onetl.connection.DBConnection.WriteOptions`, default: ``None`` - Spark write options. + Spark write options. Can be in form of special ``WriteOptions`` object or a dict. 
For example: ``{"if_exists": "replace_entire_table", "compression": "snappy"}`` @@ -107,9 +115,7 @@ class DBWriter(FrozenModel): spark=spark, ) - options = {"truncate": "true", "batchsize": 1000} - # or (it is the same): - options = Postgres.WriteOptions(truncate=True, batchsize=1000) + options = Postgres.WriteOptions(if_exists="replace_entire_table", batchsize=1000) writer = DBWriter( connection=postgres, @@ -172,6 +178,8 @@ def run(self, df: DataFrame): .. note :: Method does support only **batching** DataFrames. + .. versionadded:: 0.1.0 + Parameters ---------- df : pyspark.sql.dataframe.DataFrame diff --git a/onetl/exception.py b/onetl/exception.py index 45ea80020..03650e9a1 100644 --- a/onetl/exception.py +++ b/onetl/exception.py @@ -23,7 +23,9 @@ class DirectoryNotFoundError(OSError): Like ``FileNotFoundError``, but for directory. Cannot be replaced with ``NotAFileError`` because on some operating systems - (e.g. Linux) there are other file types than regular file and directory - symlink, device, etc + (e.g. Linux) there are other file types than regular file and directory - symlink, device, etc. + + .. versionadded:: 0.3.0 """ @@ -31,87 +33,118 @@ class NotAFileError(OSError): """ Like ``NotADirectoryError``, but for files. - Cannot be replaced with ``FileNotFoundError``, it has different meaning + Cannot be replaced with ``FileNotFoundError``, it has different meaning. + + .. versionadded:: 0.3.0 """ class FileSizeMismatchError(OSError): """ - File size mismatch + File size mismatch. + + .. versionadded:: 0.8.0 """ class DirectoryExistsError(OSError): """ Like ``FileExistsError``, but for directories. + + .. versionadded:: 0.8.0 """ class DirectoryNotEmptyError(OSError): """ - Raised when trying to remove directory contains some files or other directories + Raised when trying to remove a directory which contains some files or other directories. + + ..
versionadded:: 0.3.0 """ class NoDataError(NeedEvacuation): """ - Raised when there is no data in FileResult or DataFrame + Raised when there is no data in FileResult or DataFrame. + + .. versionadded:: 0.4.0 """ class FilesError(RuntimeError): """ - Raised when something went wrong while working with file collection + Raised when something went wrong while working with file collection. + + .. versionadded:: 0.4.0 """ class SkippedFilesError(FilesError): """ - Raised when file collection contains skipped files + Raised when file collection contains skipped files. + + .. versionadded:: 0.4.0 """ class FailedFilesError(FilesError): """ - Raised when file collection contains failed files + Raised when file collection contains failed files. + + .. versionadded:: 0.4.0 """ class MissingFilesError(FilesError): """ - Raised when file collection contains missing files + Raised when file collection contains missing files. + + .. versionadded:: 0.4.0 """ class ZeroFileSizeError(FilesError): """ - Raised when file collection contains some zero-sized file + Raised when file collection contains some zero-sized file. + + .. versionadded:: 0.4.0 """ class EmptyFilesError(FilesError, NoDataError): """ - Raised when file collection is empty + Raised when file collection is empty. + + .. versionadded:: 0.4.0 """ class SparkError(RuntimeError): """ - Raised when something went wrong while working with Spark + Raised when something went wrong while working with Spark. + + .. versionadded:: 0.5.0 """ class TooManyParallelJobsError(SparkError): """ - Raised when number parallel jobs is too high + Raised when number parallel jobs is too high. + + .. versionadded:: 0.5.0 """ class SignatureError(TypeError): """ - Raised when hook signature is not consistent with slot + Raised when hook signature is not consistent with slot. + + .. 
versionadded:: 0.7.0 """ class TargetAlreadyExistsError(Exception): - """Raised if the target already exists in source""" + """Raised if the target already exists in source. + + .. versionadded:: 0.9.0 + """ diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 6967387a6..b18fc1792 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -43,6 +43,8 @@ class FileDFReader(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.9.0 + Parameters ---------- connection : :obj:`BaseFileDFConnection ` @@ -115,6 +117,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: """ Method for reading files as DataFrame. |support_hooks| + .. versionadded:: 0.9.0 + Parameters ---------- diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index 7fdbe64d9..714cf1a9d 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -32,6 +32,8 @@ class FileDFReaderOptions(FileDFReadOptions, GenericOptions): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- Created reader options diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index f179491e0..3fe45ff40 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -80,6 +80,11 @@ class FileDownloader(FrozenModel): It does NOT support direct file transfer between filesystems, like ``FTP -> SFTP``. You should use FileDownloader + :ref:`file-uploader` to implement ``FTP -> local dir -> SFTP``. + .. versionadded:: 0.1.0 + + .. 
versionchanged:: 0.8.0 + Moved ``onetl.core.FileDownloader`` → ``onetl.file.FileDownloader`` + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -112,15 +117,30 @@ class FileDownloader(FrozenModel): Otherwise instead of ``rename``, remote OS will move file between filesystems, which is NOT atomic operation. + .. versionadded:: 0.5.0 + filters : list of :obj:`BaseFileFilter ` Return only files/directories matching these filters. See :ref:`file-filters` + .. versionchanged:: 0.3.0 + Replaces old ``source_path_pattern: str`` and ``exclude_dirs: str`` options. + + .. versionchanged:: 0.8.0 + Renamed ``filter`` → ``filters`` + limits : list of :obj:`BaseFileLimit ` Apply limits to the list of files/directories, and stop if one of the limits is reached. See :ref:`file-limits` + .. versionadded:: 0.4.0 + + .. versionchanged:: 0.8.0 + Renamed ``limit`` → ``limits`` + options : :obj:`~FileDownloader.Options` | dict | None, default: ``None`` - File downloading options. See :obj:`~FileDownloader.Options` + File downloading options. See :obj:`FileDownloader.Options ` + + .. versionadded:: 0.3.0 hwm : type[HWM] | None, default: ``None`` @@ -129,6 +149,11 @@ class FileDownloader(FrozenModel): .. warning :: Used only in :obj:`IncrementalStrategy `. + .. versionadded:: 0.5.0 + + .. versionchanged:: 0.10.0 + Replaces deprecated ``hwm_type`` attribute + Examples -------- Simple Downloader creation @@ -235,6 +260,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul This method can return different results depending on :ref:`strategy` + .. versionadded:: 0.1.0 + Parameters ---------- @@ -247,9 +274,11 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul If not, download to ``local_path`` **all** input files, **ignoring** filters, limits and HWM. + .. 
versionadded:: 0.3.0 + Returns ------- - downloaded_files : :obj:`DownloadResult ` + :obj:`DownloadResult ` Download result object @@ -266,76 +295,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DownloadResul Examples -------- - Download files from ``source_path`` to ``local_path`` - - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", local_path="/local", ...) - downloaded_files = downloader.run() + Download files from ``source_path`` to ``local_path``: - assert downloaded_files.successful == { + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", local_path="/local", ...) + >>> download_result = downloader.run() + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), LocalPath("/local/file2.txt"), - LocalPath("/local/nested/path/file3.txt"), # directory structure is preserved - } - assert downloaded_files.failed == {FailedRemoteFile("/remote/failed.file")} - assert downloaded_files.skipped == {RemoteFile("/remote/already.exists")} - assert downloaded_files.missing == {RemotePath("/remote/missing.file")} - - Download only certain files from ``source_path`` - - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", local_path="/local", ...) 
- - # paths could be relative or absolute, but all should be in "/remote" - downloaded_files = downloader.run( - [ - "/remote/file1.txt", - "/remote/nested/path/file3.txt", - # excluding "/remote/file2.txt" - ] - ) + # directory structure is preserved + LocalPath("/local/nested/path/file3.txt"), + ]), + failed=FileSet([ + FailedRemoteFile("/remote/failed.file"), + ]), + skipped=FileSet([ + RemoteFile("/remote/already.exists"), + ]), + missing=FileSet([ + RemotePath("/remote/missing.file"), + ]), + ) - assert downloaded_files.successful == { + Download only certain files from ``source_path``: + + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", local_path="/local", ...) + >>> # paths could be relative or absolute, but all should be in "/remote" + >>> download_result = downloader.run( + ... [ + ... "/remote/file1.txt", + ... "/remote/nested/path/file3.txt", + ... # excluding "/remote/file2.txt" + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), - LocalPath("/local/nested/path/file3.txt"), # directory structure is preserved - } - assert not downloaded_files.failed - assert not downloaded_files.skipped - assert not downloaded_files.missing - - Download certain files from any folder - - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onetl.file import FileDownloader - - downloader = FileDownloader(local_path="/local", ...) # no source_path set - - # only absolute paths - downloaded_files = downloader.run( - [ - "/remote/file1.txt", - "/any/nested/path/file2.txt", - ] - ) + # directory structure is preserved + LocalPath("/local/nested/path/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert downloaded_files.successful == { + Download certain files from any folder: + + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(local_path="/local", ...) 
# no source_path set + >>> # only absolute paths + >>> download_result = downloader.run( + ... [ + ... "/remote/file1.txt", + ... "/any/nested/path/file2.txt", + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ LocalPath("/local/file1.txt"), - LocalPath("/local/file2.txt"), # directory structure is NOT preserved without source_path - } - assert not downloaded_files.failed - assert not downloaded_files.skipped - assert not downloaded_files.missing + LocalPath("/local/file2.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -399,6 +428,8 @@ def view_files(self) -> FileSet[RemoteFile]: This method can return different results depending on :ref:`strategy` + .. versionadded:: 0.3.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` @@ -417,22 +448,16 @@ def view_files(self) -> FileSet[RemoteFile]: Examples -------- - View files - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileDownloader - - downloader = FileDownloader(source_path="/remote", ...) - - view_files = downloader.view_files() + View files: - assert view_files == { - RemoteFile("/remote/file1.txt"), - RemoteFile("/remote/file3.txt"), - RemoteFile("/remote/nested/file3.txt"), - } + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(source_path="/remote", ...) + >>> downloader.view_files() + FileSet([ + RemoteFile("/remote/file1.txt"), + RemoteFile("/remote/file3.txt"), + RemoteFile("/remote/nested/file3.txt"), + ]) """ if not self.source_path: diff --git a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index 07cc2abc3..91dd44d91 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -13,7 +13,10 @@ class FileDownloaderOptions(GenericOptions): - """File downloading options""" + """File downloading options. + + .. 
versionadded:: 0.3.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,9 @@ class FileDownloaderOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete local directory content before downloading files + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ delete_source: bool = False @@ -31,6 +37,11 @@ class FileDownloaderOptions(GenericOptions): If ``True``, remove source file after successful download. If download failed, file will left intact. + + .. versionadded:: 0.2.0 + + .. versionchanged:: 0.3.0 + Moved ``FileDownloader.delete_source`` to ``FileDownloaderOptions`` """ workers: int = Field(default=1, ge=1) @@ -41,6 +52,8 @@ class FileDownloaderOptions(GenericOptions): 2 or more means files will be downloaded in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 754dcdc04..96d184e9e 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -17,42 +17,43 @@ class DownloadResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. versionadded:: 0.3.0 Examples -------- - Download files - - .. code:: python - - from onetl.impl import LocalPath, RemoteFile, FailedLocalFile - from onetl.file import FileDownloader, DownloadResult - - downloader = FileDownloader(local_path="/local", ...)
- - downloaded_files = downloader.run( - [ - "/remote/file1", - "/remote/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert downloaded_files == DownloadResult( - successful={ - LocalPath("/local/file1"), - LocalPath("/local/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={RemoteFile("/existing/file")}, - missing={RemotePath("/missing/file")}, - ) + >>> from onetl.file import FileDownloader + >>> downloader = FileDownloader(local_path="/local", ...) + >>> download_result = downloader.run( + ... [ + ... "/remote/file1", + ... "/remote/file2", + ... "/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... ) + >>> download_result + DownloadResult( + successful=FileSet([ + LocalPath("/local/file1"), + LocalPath("/local/file2"), + ]), + failed=FileSet([ + FailedRemoteFile("/failed/file") + ]), + skipped=FileSet([ + RemoteFile("/existing/file") + ]), + missing=FileSet([ + RemotePath("/missing/file") + ]), + ) """ successful: FileSet[LocalPath] = Field(default_factory=FileSet) diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index 1c0b55dc7..0bb2e6669 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -67,6 +67,8 @@ class FileMover(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.8.0 + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -89,7 +91,7 @@ class FileMover(FrozenModel): See :ref:`file-limits` options : :obj:`~FileMover.Options` | dict | None, default: ``None`` - File moving options. See :obj:`~FileMover.Options` + File moving options. See :obj:`FileMover.Options ` Examples -------- @@ -163,6 +165,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: """ Method for moving files from source to target directory. |support_hooks| + ..
versionadded:: 0.8.0 + Parameters ---------- @@ -177,7 +181,7 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: Returns ------- - moved_files : :obj:`MoveResult ` + :obj:`MoveResult ` Move result object @@ -194,76 +198,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> MoveResult: Examples -------- - Move files from ``source_path`` - - .. code:: python - - from onetl.impl import RemoteFile, RemotePath - from onetl.file import FileMover + Move files from ``source_path``: - mover = FileMover(source_path="/source", target_path="/target", ...) - moved_files = mover.run() - - assert moved_files.successful == { + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/source", target_path="/target", ...) + >>> move_result = mover.run() + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), RemoteFile("/target/file2.txt"), - RemoteFile("/target/nested/path/file3.txt"), # directory structure is preserved - } - assert moved_files.failed == {FailedRemoteFile("/source/failed.file")} - assert moved_files.skipped == {RemoteFile("/source/already.exists")} - assert moved_files.missing == {RemotePath("/source/missing.file")} - - Move only certain files from ``source_path`` - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(source_path="/source", target_path="/target", ...) 
- - # paths could be relative or absolute, but all should be in "/source" - moved_files = mover.run( - [ - "/source/file1.txt", - "/source/nested/path/file3.txt", - # excluding "/source/file2.txt" - ] - ) + # directory structure is preserved + RemoteFile("/target/nested/path/file3.txt"), + ]), + failed=FileSet([ + FailedRemoteFile("/source/failed.file"), + ]), + skipped=FileSet([ + RemoteFile("/source/already.exists"), + ]), + missing=FileSet([ + RemotePath("/source/missing.file"), + ]), + ) - assert moved_files.successful == { + Move only certain files from ``source_path``: + + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/source", target_path="/target", ...) + >>> # paths could be relative or absolute, but all should be in "/source" + >>> move_result = mover.run( + ... [ + ... "/source/file1.txt", + ... "/source/nested/path/file3.txt", + ... # excluding "/source/file2.txt" + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), - RemoteFile("/target/nested/path/file3.txt"), # directory structure is preserved - } - assert not moved_files.failed - assert not moved_files.skipped - assert not moved_files.missing - - Move certain files from any folder - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(target_path="/target", ...) # no source_path set - - # only absolute paths - moved_files = mover.run( - [ - "/remote/file1.txt", - "/any/nested/path/file3.txt", - ] - ) + # directory structure is preserved + RemoteFile("/target/nested/path/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert moved_files.successful == { + Move certain files from any folder: + + >>> from onetl.file import FileMover + >>> mover = FileMover(target_path="/target", ...) # no source_path set + >>> # only absolute paths + >>> move_result = mover.run( + ... [ + ... "/remote/file1.txt", + ... 
"/any/nested/path/file3.txt", + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ RemoteFile("/target/file1.txt"), - RemoteFile("/target/file3.txt"), # directory structure is NOT preserved without source_path - } - assert not moved_files.failed - assert not moved_files.skipped - assert not moved_files.missing + RemoteFile("/target/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -309,6 +313,8 @@ def view_files(self) -> FileSet[RemoteFile]: Get file list in the ``source_path``, after ``filter`` and ``limit`` applied (if any). |support_hooks| + .. versionadded:: 0.8.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` @@ -327,22 +333,16 @@ def view_files(self) -> FileSet[RemoteFile]: Examples -------- - View files - - .. code:: python - - from onetl.impl import RemoteFile - from onetl.file import FileMover - - mover = FileMover(source_path="/remote", ...) - - view_files = mover.view_files() + View files: - assert view_files == { - RemoteFile("/remote/file1.txt"), - RemoteFile("/remote/file3.txt"), - RemoteFile("/remote/nested/path/file3.txt"), - } + >>> from onetl.file import FileMover + >>> mover = FileMover(source_path="/remote", ...) + >>> mover.view_files() + FileSet([ + RemoteFile("/remote/file1.txt"), + RemoteFile("/remote/file2.txt"), + RemoteFile("/remote/nested/path/file3.txt"), + ]) """ if not self.source_path: diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index b68326198..ce9c12b24 100644 --- a/onetl/file/file_mover/options.py +++ b/onetl/file/file_mover/options.py @@ -13,7 +13,10 @@ class FileMoverOptions(GenericOptions): - """File moving options""" + """File moving options. + + .. 
versionadded:: 0.8.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,11 @@ class FileMoverOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete directory content before moving files + + .. versionadded:: 0.8.0 + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ workers: int = Field(default=1, ge=1) @@ -34,6 +42,8 @@ class FileMoverOptions(GenericOptions): 2 or more means files will be moved in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 90175a2d4..99313d0f7 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -17,42 +17,43 @@ class MoveResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. versionadded:: 0.8.0 Examples -------- - Move files - - .. code:: python - - from onetl.impl import RemotePath, RemoteFile, FailedLocalFile - from onetl.file import FileMover, MoveResult - - mover = FileMover(local_path="/local", ...) - - moved_files = mover.run( - [ - "/source/file1", - "/source/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert moved_files == MoveResult( - successful={ - RemoteFile("/target/file1"), - RemoteFile("/target/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={RemoteFile("/existing/file")}, - missing={RemotePath("/missing/file")}, - ) + >>> from onetl.file import FileMover + >>> mover = FileMover(target_path="/target", ...) + >>> move_result = mover.run( + ... [ + ... "/source/file1", + ... "/source/file2", + ...
"/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... ) + >>> move_result + MoveResult( + successful=FileSet([ + RemoteFile("/target/file1"), + RemoteFile("/target/file2"), + ]), + failed=FileSet([ + FailedLocalFile("/failed/file") + ]), + skipped=FileSet([ + RemoteFile("/existing/file") + ]), + missing=FileSet([ + RemotePath("/missing/file") + ]), + ) """ successful: FileSet[RemoteFile] = Field(default_factory=FileSet) diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index 2841741f9..d2b9aec6e 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -61,16 +61,13 @@ def successful_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.successful_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.successful_count + 2 """ return len(self.successful) @@ -83,16 +80,13 @@ def failed_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile - from onet.file.file_result import FileResult - - file_result = FileResult( - failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, - ) - - assert file_result.failed_count == 2 + >>> from onetl.impl import RemoteFile + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, + ... ) + >>> file_result.failed_count + 2 """ return len(self.failed) @@ -105,16 +99,13 @@ def skipped_count(self) -> int: Examples -------- - .. 
code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.skipped_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.skipped_count + 2 """ return len(self.skipped) @@ -127,16 +118,13 @@ def missing_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - missing={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.missing_count == 2 + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... missing={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.missing_count + 2 """ return len(self.missing) @@ -149,19 +137,16 @@ def total_count(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - assert file_result.total_count == 6 + >>> from onetl.impl import RemoteFile, LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, + ... skipped={LocalPath("/skipped/file")}, + ... missing={LocalPath("/missing/file")}, + ...
) + >>> file_result.total_count + 6 """ return self.successful_count + self.failed_count + self.missing_count + self.skipped_count @@ -174,16 +159,13 @@ def successful_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.successful_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.successful_size # in bytes + 1024 """ return self.successful.total_size @@ -196,16 +178,16 @@ def failed_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile - from onet.file.file_result import FileResult - - file_result = FileResult( - failed={RemoteFile("/some/file"), RemoteFile("/some/another.file")}, - ) - - assert file_result.failed_size == 1_000_000 # in bytes + >>> from onetl.impl import RemoteFile, RemotePathStat + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... failed={ + ... RemoteFile("/some/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/some/another.file", stats=RemotePathStat(st_size=1024)), + ... }, + ... ) + >>> file_result.failed_size # in bytes + 2048 """ return self.failed.total_size @@ -218,16 +200,13 @@ def skipped_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, - ) - - assert file_result.skipped_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ...
skipped={LocalPath("/some/file"), LocalPath("/some/another.file")}, + ... ) + >>> file_result.skipped_size # in bytes + 1024 """ return self.skipped.total_size @@ -240,19 +219,19 @@ def total_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - assert file_result.total_size == 10_000_000 # in bytes + >>> from onetl.impl import RemoteFile, RemotePathStat, LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... failed={ + ... RemoteFile("/remote/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/remote/another.file", stats=RemotePathStat(st_size=1024)) + ... }, + ... skipped={LocalPath("/skipped/file")}, + ... missing={LocalPath("/missing/file")}, + ... ) + >>> file_result.total_size # in bytes + 4096 """ return self.successful_size + self.failed_size + self.skipped_size @@ -270,33 +249,31 @@ def raise_if_failed(self) -> None: Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - files_with_exception = [ - FailedRemoteFile( - path="/remote/file1", - exception=NotAFileError("'/remote/file1' is not a file"), - ), - FailedRemoteFile( - path="/remote/file2", - exception=FileMissingError("'/remote/file2' does not exist"), - ), - ] - - file_result = FileResult(failed=files_with_exception) - - file_result.raise_if_failed() - # will raise FailedFilesError(''' - # Failed 2 files (10MB): - # '/remote/file1' (1 MB) - # NotAFileError("'/remote/file1' is not a file") - # - # '/remote/file2' (9 MB) - # FileMissingError("'/remote/file2' does not exist") - # ''') + >>> from onetl.impl import FailedRemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError + >>> from onetl.file.file_result import FileResult + >>> files_with_exception = [ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... ] + >>> file_result = FileResult(failed=files_with_exception) + >>> file_result.raise_if_failed() + Traceback (most recent call last): + ... + onetl.exception.FailedFilesError: Failed 2 files (size='0 bytes'): + '/remote/file1' (size='0 bytes') + NotAFileError("'/remote/file1' is not a file") + + '/remote/file2' (size='0 Bytes') + PermissionError("'/remote/file2': [Errno 13] Permission denied") """ if self.failed: @@ -315,24 +292,20 @@ def raise_if_missing(self) -> None: Examples -------- - ..
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - missing={ - LocalPath("/missing/file1"), - LocalPath("/missing/file2"), - }, - ) - - file_result.raise_if_missing() - # will raise MissingFilesError(''' - # Missing 2 files: - # '/missing/file1' - # '/missing/file2' - # ''') + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... missing={ + ... LocalPath("/missing/file1"), + ... LocalPath("/missing/file2"), + ... }, + ... ) + >>> file_result.raise_if_missing() + Traceback (most recent call last): + ... + onetl.exception.MissingFilesError: Missing 2 files: + '/missing/file1' + '/missing/file2' """ if self.missing: @@ -351,21 +324,20 @@ def raise_if_skipped(self) -> None: Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, - ) - - file_result.raise_if_skipped() - # will raise SkippedFilesError(''' - # Skipped 2 files (15 kB): - # '/skipped/file1' (10kB) - # '/skipped/file2' (5 kB) - # ''') + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... skipped={ + ... LocalPath("/skipped/file1"), + ... LocalPath("/skipped/file2"), + ... }, + ... ) + >>> file_result.raise_if_skipped() + Traceback (most recent call last): + ... + onetl.exception.SkippedFilesError: Skipped 2 files (15 kB): + '/skipped/file1' (10kB) + '/skipped/file2' (5 kB) """ if self.skipped: @@ -384,25 +356,22 @@ def raise_if_contains_zero_size(self) -> None: Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult( - successful={ - LocalPath("/local/empty1.file"), - LocalPath("/local/empty2.file"), - LocalPath("/local/normal.file"), - }, - ) - - file_result.raise_if_contains_zero_size() - # will raise ZeroFileSizeError(''' - # 2 files out of 3 have zero size: - # '/local/empty1.file' - # '/local/empty2.file' - # ''') + >>> from onetl.exception import ZeroFileSizeError + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult( + ... successful={ + ... LocalPath("/local/empty1.file"), + ... LocalPath("/local/empty2.file"), + ... LocalPath("/local/normal.file"), + ... }, + ... ) + >>> file_result.raise_if_contains_zero_size() + Traceback (most recent call last): + ... + onetl.exception.ZeroFileSizeError: 2 files out of 3 have zero size: + '/local/empty1.file' + '/local/empty2.file' """ self.successful.raise_if_contains_zero_size() @@ -415,18 +384,16 @@ def is_empty(self) -> bool: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult() - assert file_result1.is_empty - - file_result2 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - ) - assert not file_result2.is_empty + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult() + >>> file_result1.is_empty + True + >>> file_result2 = FileResult( + ... successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, + ... ) + >>> file_result2.is_empty + False """ return not self.failed and not self.successful and not self.skipped @@ -444,15 +411,12 @@ def raise_if_empty(self) -> None: Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result = FileResult() - - file_result.raise_if_empty() - # will raise EmptyFilesError("There are no files in the result") + >>> from onetl.file.file_result import FileResult + >>> file_result = FileResult() + >>> file_result.raise_if_empty() + Traceback (most recent call last): + ... + onetl.exception.EmptyFilesError: There are no files in the result """ if self.is_empty: @@ -460,71 +424,66 @@ def raise_if_empty(self) -> None: @property def details(self) -> str: - ''' + """ Return detailed information about files in the result object Examples -------- - .. code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={ - FailedRemoteFile( - path="/remote/file1", - exception=NotAFileError("'/remote/file1' is not a file"), - ), - FailedRemoteFile( - path="/remote/file2", - exception=FileMissingError("'/remote/file2' does not exist"), - ), - }, - skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, - missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, - ) - - details1 = """ - Total: 8 files (10.4 MB) - - Successful 2 files (30.7 kB): - '/successful1' (10.2 kB) - '/successful2' (20.5 kB) - - Failed 2 files (10MB): - '/remote/file1' (1 MB) - NotAFileError("'/remote/file1' is not a file") - - '/remote/file2' (9 MB) - FileMissingError("'/remote/file2' does not exist") - - Skipped 2 files (15 kB): - '/skipped/file1' (10kB) - '/skipped/file2' (5 kB) - - Missing 2 files: - '/missing/file1' - '/missing/file2' - """ - - assert file_result1.details == details1 - - file_result2 = FileResult() - details2 = """ - No successful files - - No failed files - - No skipped files - - No missing files - """ - - assert file_result2.details == details2 - ''' + >>> from onetl.impl 
import FailedRemoteFile, LocalPath, RemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult( + ... successful={ + ... RemoteFile("/local/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/local/another.file", stats=RemotePathStat(st_size=1024)), + ... }, + ... failed={ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... }, + ... skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, + ... missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, + ... ) + >>> print(file_result1.details) + Total: 8 files (size='2.0 kB') + + Successful 2 files (size='2.0 kB'): + '/local/another.file' (size='1.0 kB') + '/local/file' (size='1.0 kB') + + Failed 2 files (size='0 Bytes'): + '/remote/file2' (size='0 Bytes') + PermissionError("'/remote/file2': [Errno 13] Permission denied") + '/remote/file1' (size='0 Bytes') + NotAFileError("'/remote/file1' is not a file") + + Skipped 2 files (size='0 Bytes'): + '/skipped/file1' + '/skipped/file2' + + Missing 2 files: + '/missing/file2' + '/missing/file1' + + >>> file_result2 = FileResult() + >>> print(file_result2.details) + No successful files + + No failed files + + No skipped files + + No missing files + """ result = [] @@ -540,41 +499,50 @@ def details(self) -> str: @property def summary(self) -> str: - ''' + """ Return short summary about files in the result object Examples -------- - .. 
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_result import FileResult - - file_result1 = FileResult( - successful={LocalPath("/local/file"), LocalPath("/local/another.file")}, - failed={RemoteFile("/remote/file"), RemoteFile("/remote/another.file")}, - skipped={LocalPath("/skipped/file")}, - missing={LocalPath("/missing/file")}, - ) - - result = """ - Total: 8 files (10.4 MB) - - Successful: 2 files (30.7 kB) - - Failed: 2 files (10MB) - - Skipped: 2 files (15 kB) - - Missing: 2 files - """ - - assert file_result1.summary == result - - file_result2 = FileResult() - assert file_result1.summary == "No files" - ''' + >>> from onetl.impl import FailedRemoteFile, LocalPath, RemoteFile, RemotePathStat + >>> from onetl.exception import NotAFileError + >>> from onetl.file.file_result import FileResult + >>> file_result1 = FileResult( + ... successful={ + ... RemoteFile("/local/file", stats=RemotePathStat(st_size=1024)), + ... RemoteFile("/local/another.file", stats=RemotePathStat(st_size=1024)), + ... }, + ... failed={ + ... FailedRemoteFile( + ... path="/remote/file1", + ... stats=RemotePathStat(st_size=0), + ... exception=NotAFileError("'/remote/file1' is not a file"), + ... ), + ... FailedRemoteFile( + ... path="/remote/file2", + ... stats=RemotePathStat(st_size=0), + ... exception=PermissionError("'/remote/file2': [Errno 13] Permission denied"), + ... ), + ... }, + ... skipped={LocalPath("/skipped/file1"), LocalPath("/skipped/file2")}, + ... missing={LocalPath("/missing/file1"), LocalPath("/missing/file2")}, + ... 
) + >>> print(file_result1.summary) + Total: 8 files (size='2.0 kB') + + Successful: 2 files (size='2.0 kB') + + Failed: 2 files (size='0 Bytes') + + Skipped: 2 files (size='0 Bytes') + + Missing: 2 files + + >>> file_result2 = FileResult() + >>> print(file_result2.summary) + No files + """ return self._total_message def __str__(self): diff --git a/onetl/file/file_set.py b/onetl/file/file_set.py index 4cbf86ea5..2447ad04f 100644 --- a/onetl/file/file_set.py +++ b/onetl/file/file_set.py @@ -33,14 +33,11 @@ def total_size(self) -> int: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - file_set = FileSet({LocalPath("/some/file"), LocalPath("/some/another.file")}) - - assert path_set.total_size == 1_000_000 # in bytes + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_set import FileSet + >>> file_set = FileSet({LocalPath("/some/file"), LocalPath("/some/another.file")}) + >>> file_set.total_size # in bytes + 1024 """ return sum( @@ -60,14 +57,12 @@ def raise_if_empty(self) -> None: Examples -------- - .. code:: python - - from onet.file.file_set import FileSet - - file_set = FileSet() - - file_set.raise_if_empty() - # will raise EmptyFilesError("There are no files in the set") + >>> from onetl.file.file_set import FileSet + >>> file_set = FileSet() + >>> file_set.raise_if_empty() + Traceback (most recent call last): + ... + onetl.exception.EmptyFilesError: There are no files in the set """ if not self: @@ -86,23 +81,19 @@ def raise_if_contains_zero_size(self) -> None: Examples -------- - ..
code:: python - - from onetl.impl import RemoteFile, LocalPath - from onet.file.file_set import FileSet - - file_set = FileSet( - LocalPath("/local/empty1.file"), - LocalPath("/local/empty2.file"), - LocalPath("/local/normal.file"), - ) - - file_set.raise_if_contains_zero_size() - # will raise ZeroFileSizeError(''' - # 2 files out of 3 have zero size: - # '/local/empty1.file' - # '/local/empty2.file' - # ''') + >>> from onetl.impl import RemoteFile, LocalPath + >>> from onetl.file.file_set import FileSet + >>> file_set = FileSet( + ... LocalPath("/local/empty1.file"), + ... LocalPath("/local/empty2.file"), + ... LocalPath("/local/normal.file"), + ... ) + >>> file_set.raise_if_contains_zero_size() + Traceback (most recent call last): + ... + onetl.exception.ZeroFileSizeError: 2 files out of 3 have zero size: + '/local/empty1.file' + '/local/empty2.file' """ lines = [] @@ -132,21 +123,19 @@ def summary(self) -> str: Examples -------- - .. code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - path_set1 = FileSet( - [ - LocalPath("/local/file"), - LocalPath("/local/another.file"), - ] - ) - - assert path_set1.summary == "2 files (30.7 kB)" - - assert FileSet().summary == "No files" + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_set import FileSet + >>> path_set1 = FileSet( + ... [ + ... LocalPath("/local/file"), + ... LocalPath("/local/another.file"), + ... ] + ... ) + >>> print(path_set1.summary) + 2 files (size='30.7 kB') + >>> path_set2 = FileSet() + >>> print(path_set2.summary) + No files """ if not self: @@ -156,35 +145,30 @@ def summary(self) -> str: return f"{file_number_str} (size='{naturalsize(self.total_size)}')" @property - def details(self) -> str: - ''' + def details(self) -> str: # noqa: WPS473 + """ Return detailed information about files in the set Examples -------- - ..
code:: python - - from onetl.impl import LocalPath - from onet.file.file_set import FileSet - - path_set1 = FileSet( - [ - LocalPath("/local/file"), - LocalPath("/local/another.file"), - ] - ) - - details1 = """ - 2 files (30.7 kB): - '/local/file' (10.2 kB) - '/local/another.file' (20.5 kB) - """ - - assert path_set1.details == details1 - - assert FileSet().details == "No files" - ''' + >>> from onetl.impl import LocalPath + >>> from onetl.file.file_set import FileSet + >>> path_set1 = FileSet( + ... [ + ... LocalPath("/local/file"), + ... LocalPath("/local/another.file"), + ... ] + ... ) + >>> print(path_set1.details) + 2 files (30.7 kB): + '/local/file' (10.2 kB) + '/local/another.file' (20.5 kB) + + >>> path_set2 = FileSet() + >>> print(path_set2.details) + No files + """ if not self: return self.summary diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index ba107310b..9ab5f088f 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -63,6 +63,11 @@ class FileUploader(FrozenModel): This class does **not** support read strategies. + .. versionadded:: 0.1.0 + + .. versionchanged:: 0.8.0 + Moved ``onetl.core.FileUploader`` → ``onetl.file.FileUploader`` + Parameters ---------- connection : :obj:`onetl.connection.FileConnection` @@ -77,6 +82,8 @@ class FileUploader(FrozenModel): Could be ``None``, but only if you pass absolute file paths directly to :obj:`~run` method + .. versionadded:: 0.3.0 + temp_path : os.PathLike or str, optional, default: ``None`` If set, this path will be used for uploading a file, and then renaming it to the target file path. If ``None`` (default since v0.5.0) is passed, files are uploaded directly to ``target_path``. @@ -95,8 +102,11 @@ class FileUploader(FrozenModel): Otherwise instead of ``rename``, remote OS will move file between filesystems, which is NOT atomic operation. + ..
versionchanged:: 0.5.0 + Default changed from ``/tmp`` to ``None`` + options : :obj:`~FileUploader.Options` | dict | None, default: ``None`` - File upload options. See :obj:`~FileUploader.Options` + File upload options. See :obj:`FileUploader.Options ` Examples -------- @@ -151,6 +161,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: """ Method for uploading files to remote host. |support_hooks| + .. versionadded:: 0.1.0 + Parameters ---------- @@ -161,7 +173,7 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: Returns ------- - uploaded_files : :obj:`UploadResult ` + :obj:`UploadResult ` Upload result object @@ -182,85 +194,76 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> UploadResult: Examples -------- - Upload files from ``local_path`` to ``target_path`` - - .. code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", target_path="/remote", ...) - uploaded_files = uploader.run() + Upload files from ``local_path`` to ``target_path``: - assert uploaded_files.successful == { + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", target_path="/remote", ...) + >>> upload_result = uploader.run() + >>> upload_result + UploadResult( + successful=FileSet([ RemoteFile("/remote/file1"), RemoteFile("/remote/file2"), - RemoteFile("/remote/nested/path/file3"), # directory structure is preserved - } - assert uploaded_files.failed == {FailedLocalFile("/local/failed.file")} - assert uploaded_files.skipped == {LocalPath("/local/already.exists")} - assert uploaded_files.missing == {LocalPath("/local/missing.file")} - - Upload only certain files from ``local_path`` - - .. 
code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", target_path="/remote", ...) - - # paths could be relative or absolute, but all should be in "/local" - uploaded_files = uploader.run( - [ - "/local/file1", - "/local/nested/path/file3", - # excluding "/local/file2", - ] - ) + # directory structure is preserved + RemoteFile("/remote/nested/path/file3") + ]), + failed=FileSet([ + FailedLocalFile("/local/failed.file"), + ]), + skipped=FileSet([ + LocalPath("/local/already.exists"), + ]), + missing=FileSet([ + LocalPath("/local/missing.file"), + ]), + ) - assert uploaded_files.successful == { + Upload only certain files from ``local_path``: + + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", target_path="/remote", ...) + >>> # paths could be relative or absolute, but all should be in "/local" + >>> upload_result = uploader.run( + ... [ + ... "/local/file1", + ... "/local/nested/path/file3", + ... # excluding "/local/file2", + ... ] + ... ) + >>> upload_result + UploadResult( + successful=FileSet([ RemoteFile("/remote/file1"), - RemoteFile("/remote/nested/path/file3"), # directory structure is preserved - } - assert not uploaded_files.failed - assert not uploaded_files.skipped - assert not uploaded_files.missing - - Upload only certain files from any folder - - .. code:: python - - from onetl.impl import ( - RemoteFile, - LocalPath, - ) - from onetl.file import FileUploader - - uploader = FileUploader(target_path="/remote", ...) 
# no local_path set - - # only absolute paths - uploaded_files = uploader.run( - [ - "/local/file1.txt", - "/any/nested/path/file3.txt", - ] - ) + # directory structure is preserved + RemoteFile("/remote/nested/path/file3"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) - assert uploaded_files.successful == { - RemoteFile("/remote/file1"), - RemoteFile("/remote/file3"), + Upload only certain files from any folder: + + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(target_path="/remote", ...) # no local_path set + >>> # only absolute paths + >>> upload_result = uploader.run( + ... [ + ... "/local/file1.txt", + ... "/any/nested/path/file3.txt", + ... ] + ... ) + >>> upload_result + UploadResult( + successful=FileSet([ + RemoteFile("/remote/file1.txt"), # directory structure is NOT preserved without local_path - } - assert not uploaded_files.failed - assert not uploaded_files.skipped - assert not uploaded_files.missing + RemoteFile("/remote/file3.txt"), + ]), + failed=FileSet([]), + skipped=FileSet([]), + missing=FileSet([]), + ) """ entity_boundary_log(log, f"{self.__class__.__name__}.run() starts") @@ -314,6 +317,8 @@ def view_files(self) -> FileSet[LocalPath]: """ Get file list in the ``local_path``. |support_hooks| + .. versionadded:: 0.3.0 + Raises ------ :obj:`onetl.exception.DirectoryNotFoundError` @@ -332,22 +337,16 @@ def view_files(self) -> FileSet[LocalPath]: Examples -------- - View files - - .. code:: python - - from onetl.impl import LocalPath - from onetl.file import FileUploader - - uploader = FileUploader(local_path="/local", ...) - - view_files = uploader.view_files() + View files: - assert view_files == { - LocalPath("/local/file1.txt"), - LocalPath("/local/file3.txt"), - LocalPath("/local/nested/path/file3.txt"), - } + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(local_path="/local", ...) 
+ >>> uploader.view_files() + FileSet([ + LocalPath("/local/file1.txt"), + LocalPath("/local/file3.txt"), + LocalPath("/local/nested/path/file3.txt"), + ]) """ if not self.local_path: diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index 9789d8477..b046ec3a8 100644 --- a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -13,7 +13,10 @@ class FileUploaderOptions(GenericOptions): - """File uploading options""" + """File uploading options. + + .. versionadded:: 0.3.0 + """ if_exists: FileExistBehavior = Field(default=FileExistBehavior.ERROR, alias="mode") """ @@ -24,6 +27,9 @@ class FileUploaderOptions(GenericOptions): * ``ignore`` - do nothing, mark file as ignored * ``replace_file`` - replace existing file with a new one * ``replace_entire_directory`` - delete local directory content before downloading files + + .. versionchanged:: 0.9.0 + Renamed ``mode`` → ``if_exists`` """ delete_local: bool = False @@ -31,6 +37,11 @@ class FileUploaderOptions(GenericOptions): If ``True``, remove local file after successful download. If download failed, file will left intact. + + .. versionadded:: 0.2.0 + + .. versionchanged:: 0.3.0 + Move ``FileUploader.delete_local`` to ``FileUploaderOptions`` """ workers: int = Field(default=1, ge=1) @@ -41,6 +52,8 @@ class FileUploaderOptions(GenericOptions): 2 or more means files will be uploaded in parallel workers. Recommended value is ``min(32, os.cpu_count() + 4)``, e.g. ``5``. + + .. 
versionadded:: 0.8.1 """ @root_validator(pre=True) diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 9a785c719..34638bae5 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -17,42 +17,43 @@ class UploadResult(FileResult): Container for file paths, divided into certain categories: - * :obj:`successful` - * :obj:`failed` - * :obj:`skipped` - * :obj:`missing` + * :obj:`~successful` + * :obj:`~failed` + * :obj:`~skipped` + * :obj:`~missing` + + .. versionadded:: 0.3.0 Examples -------- - Upload files - - .. code:: python - - from onetl.impl import LocalPath, RemoteFile, FailedLocalFile - from onetl.file import FileUploader, UploadResult - - uploader = FileUploader(target_path="/remote", ...) - - uploaded_files = uploader.run( - [ - "/local/file1", - "/local/file2", - "/failed/file", - "/existing/file", - "/missing/file", - ] - ) - - assert uploaded_files == UploadResult( - successful={ - RemoteFile("/remote/file1"), - RemoteFile("/remote/file2"), - }, - failed={FailedLocalFile("/failed/file")}, - skipped={LocalPath("/existing/file")}, - missing={LocalPath("/missing/file")}, - ) + >>> from onetl.file import FileUploader + >>> uploader = FileUploader(target_path="/remote", ...) + >>> upload_result = uploader.run( + ... [ + ... "/local/file1", + ... "/local/file2", + ... "/failed/file", + ... "/existing/file", + ... "/missing/file", + ... ] + ... 
) + >>> upload_result + UploadResult( + successful=FileSet([ + RemoteFile("/remote/file1"), + RemoteFile("/remote/file2"), + ]), + failed=FileSet([ + FailedLocalFile("/failed/file") + ]), + skipped=FileSet([ + LocalPath("/existing/file") + ]), + missing=FileSet([ + LocalPath("/missing/file") + ]), + ) """ successful: FileSet[RemoteFile] = Field(default_factory=FileSet) diff --git a/onetl/file/filter/exclude_dir.py b/onetl/file/filter/exclude_dir.py index a7d9007a5..f5b096d2c 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -16,6 +16,9 @@ class ExcludeDir(BaseFileFilter, FrozenModel): """Filter files or directories which are included in a specific directory. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index 768261e60..db622cfd3 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -16,6 +16,9 @@ class Glob(BaseFileFilter, FrozenModel): """Filter files or directories with path matching a glob expression. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/filter/match_all_filters.py b/onetl/file/filter/match_all_filters.py index e5c6a7d56..484bee93f 100644 --- a/onetl/file/filter/match_all_filters.py +++ b/onetl/file/filter/match_all_filters.py @@ -14,6 +14,8 @@ def match_all_filters(path: PathProtocol, filters: Iterable[BaseFileFilter]) -> """ Check if input path satisfies all the filters. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` @@ -31,16 +33,15 @@ def match_all_filters(path: PathProtocol, filters: Iterable[BaseFileFilter]) -> Examples -------- - .. 
code:: python - - from onetl.file.filter import Glob, ExcludeDir, match_all_filters - from onetl.impl import LocalPath - - filters = [Glob("*.csv"), ExcludeDir("/excluded")] - - assert match_all_filters(LocalPath("/path/to/file.csv"), filters) - assert not match_all_filters(LocalPath("/path/to/file.txt"), filters) - assert not match_all_filters(LocalPath("/excluded/path/file.csv"), filters) + >>> from onetl.file.filter import Glob, ExcludeDir, match_all_filters + >>> from onetl.impl import RemoteFile, RemotePathStat + >>> filters = [Glob("*.csv"), ExcludeDir("/excluded")] + >>> match_all_filters(RemoteFile("/path/to/file.csv", stats=RemotePathStat()), filters) + True + >>> match_all_filters(RemoteFile("/path/to/file.txt", stats=RemotePathStat()), filters) + False + >>> match_all_filters(RemoteFile("/excluded/path/file.csv", stats=RemotePathStat()), filters) + False """ empty = True diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index 0c64001da..48e321ad1 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -17,6 +17,9 @@ class Regexp(BaseFileFilter, FrozenModel): r"""Filter files or directories with path matching a regular expression. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileFilter`` + Parameters ---------- diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 551c76c9b..3699620b0 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -20,7 +20,8 @@ from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import DataFrameReader, DataFrameWriter, SparkSession + from pyspark.sql import Column, DataFrameReader, DataFrameWriter, SparkSession + PROHIBITED_OPTIONS = frozenset( ( @@ -64,7 +65,6 @@ class Avro(ReadWriteFileFormat): * Spark versions: 2.4.x - 3.5.x * Java versions: 8 - 20 - * Scala versions: 2.11 - 2.13 See documentation from link above. 
@@ -75,6 +75,8 @@ class Avro(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -97,7 +99,10 @@ class Avro(ReadWriteFileFormat): schema = { "type": "record", "name": "Person", - "fields": [{"name": "name", "type": "string"}], + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"}, + ], } avro = Avro(schema_dict=schema, compression="snappy") @@ -117,8 +122,8 @@ class Config: @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -126,6 +131,8 @@ def get_packages( See `Maven package index `_ for all available packages. + .. versionadded:: 0.9.0 + Parameters ---------- spark_version : str @@ -148,15 +155,15 @@ def get_packages( """ - spark_ver = Version.parse(spark_version) - if spark_ver < (2, 4): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("2.4"): raise ValueError(f"Spark version should be at least 2.4, got {spark_version}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) - if scala_ver.digits(2) < (2, 11): - raise ValueError(f"Scala version should be at least 2.11, got {scala_ver}") + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) + if scala_ver < Version("2.11"): + raise ValueError(f"Scala version should be at least 2.11, got {scala_ver.format('{0}.{1}')}") - return [f"org.apache.spark:spark-avro_{scala_ver.digits(2)}:{spark_ver.digits(3)}"] + return [f"org.apache.spark:spark-avro_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] @slot def check_if_supported(self, spark: SparkSession) -> None: @@ -189,8 +196,231 @@ def apply_to_writer(self, writer: DataFrameWriter) -> DataFrameWriter: 
options["avroSchema"] = json.dumps(self.schema_dict) return writer.format(self.name).options(**options) + def parse_column(self, column: str | Column) -> Column: + """ + Parses an Avro binary column into a structured Spark SQL column using Spark's + `from_avro `_ function, based on the schema provided within the class. + + .. note:: + + Can be used only with Spark 3.x+ + + .. warning:: + + If ``schema_url`` is provided, ``requests`` library is used to fetch the schema from the URL. It should be installed manually, like this: + + .. code:: bash + + pip install requests + + .. versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing Avro bytes to deserialize. + Schema should match the provided Avro schema. + + Returns + ------- + Column with deserialized data. Schema is matching the provided Avro schema. Column name is the same as input column. + + Raises + ------ + ValueError + If the Spark version is less than 3.x or if neither schema_dict nor schema_url is defined. + ImportError + If ``schema_url`` is used and the ``requests`` library is not installed. 
+ + Examples + -------- + + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import Avro + >>> df.show() + +----+----------------------+----------+---------+------+-----------------------+-------------+ + |key |value |topic |partition|offset|timestamp |timestampType| + +----+----------------------+----------+---------+------+-----------------------+-------------+ + |[31]|[0A 41 6C 69 63 65 28]|topicAvro |0 |0 |2024-04-24 13:02:25.911|0 | + |[32]|[06 42 6F 62 32] |topicAvro |0 |1 |2024-04-24 13:02:25.922|0 | + +----+----------------------+----------+---------+------+-----------------------+-------------+ + >>> df.printSchema() + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = true) + |-- partition: integer (nullable = true) + |-- offset: integer (nullable = true) + |-- timestamp: timestamp (nullable = true) + |-- timestampType: integer (nullable = true) + >>> avro = Avro( + ... schema_dict={ + ... "type": "record", + ... "name": "Person", + ... "fields": [ + ... {"name": "name", "type": "string"}, + ... {"name": "age", "type": "int"}, + ... ], + ... } + ... 
) + >>> parsed_df = df.select(decode("key", "UTF-8").alias("key"), avro.parse_column("value")) + >>> parsed_df.show(truncate=False) + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 |{Bob, 25} | + +---+-----------+ + >>> parsed_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + + from pyspark.sql.avro.functions import from_avro + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column).cast("binary") + + schema = self._get_schema_json() + if not schema: + raise ValueError("Avro.parse_column can be used only with defined `schema_dict` or `schema_url`") + + return from_avro(column, schema).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into an Avro binary column using Spark's + `to_avro `_ function. + + .. note:: + + Can be used only with Spark 3.x+ + + .. warning:: + + If ``schema_url`` is provided, ``requests`` library is used to fetch the schema from the URL. It should be installed manually, like this: + + .. code:: bash + + pip install requests + + .. versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing the data to serialize to Avro format. + + Returns + ------- + Column with binary Avro data. Column name is the same as input column. + + Raises + ------ + ValueError + If the Spark version is less than 3.x. + ImportError + If ``schema_url`` is used and the ``requests`` library is not installed. 
+ + Examples + -------- + + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import Avro + >>> df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into Avro format + >>> avro = Avro( + ... schema_dict={ + ... "type": "record", + ... "name": "Person", + ... "fields": [ + ... {"name": "name", "type": "string"}, + ... {"name": "age", "type": "int"}, + ... ], + ... } + ... ) + >>> serialized_df = df.select("key", avro.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +---+----------------------+ + |key|value | + +---+----------------------+ + | 1|[0A 41 6C 69 63 65 28]| + | 2|[06 42 6F 62 32] | + +---+----------------------+ + >>> serialized_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: binary (nullable = true) + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + + from pyspark.sql.avro.functions import to_avro + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + schema = self._get_schema_json() + return to_avro(column, schema).alias(column_name) + @validator("schema_dict", pre=True) def _parse_schema_from_json(cls, value): if isinstance(value, (str, bytes)): return json.loads(value) return value + + def _check_spark_version_for_serialization(self, spark: SparkSession): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + class_name = self.__class__.__name__ + error_msg = ( + f"`{class_name}.parse_column` or 
`{class_name}.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}." + ) + raise ValueError(error_msg) + + def _get_schema_json(self) -> str: + if self.schema_dict: + return json.dumps(self.schema_dict) + elif self.schema_url: + try: + import requests + + response = requests.get(self.schema_url) # noqa: S113 + return response.text + except ImportError as e: + raise ImportError( + "The 'requests' library is required to use 'schema_url' but is not installed. " + "Install it with 'pip install requests' or avoid using 'schema_url'.", + ) from e + else: + return "" diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 41794f7ef..353a8e987 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import warnings from typing import TYPE_CHECKING, ClassVar try: @@ -9,11 +10,14 @@ except (ImportError, AttributeError): from pydantic import Field # type: ignore[no-redef, assignment] +from onetl._internal import stringify +from onetl._util.spark import get_spark_version from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import Column, SparkSession + from pyspark.sql.types import StructType READ_WRITE_OPTIONS = frozenset( @@ -81,6 +85,8 @@ class CSV(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -109,3 +115,176 @@ class Config: def check_if_supported(cls, spark: SparkSession) -> None: # always available pass + + def parse_column(self, column: str | Column, schema: StructType) -> Column: + """ + Parses a CSV string column to a structured Spark SQL column using Spark's + `from_csv `_ function, based on the provided schema. + + .. note:: + + Can be used only with Spark 3.x+ + + .. 
versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing CSV strings/bytes to parse. + + schema : StructType + The schema to apply when parsing the CSV data. This defines the structure of the output DataFrame column. + + Returns + ------- + Column with deserialized data, with the same structure as the provided schema. Column name is the same as input column. + + Examples + -------- + + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from onetl.file.format import CSV + >>> df.show() + +--+--------+ + |id|value | + +--+--------+ + |1 |Alice;20| + |2 |Bob;25 | + +--+--------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) + >>> csv = CSV(delimiter=";") + >>> csv_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... ) + >>> parsed_df = df.select("id", csv.parse_column("value", csv_schema)) + >>> parsed_df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> parsed_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + """ + + from pyspark.sql import Column, SparkSession # noqa: WPS442 + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + self._check_unsupported_serialization_options() + + from pyspark.sql.functions import col, from_csv + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column).cast("string") + + schema_string = schema.simpleString() + options = stringify(self.dict(by_alias=True)) + return from_csv(column, schema_string, 
options).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into a CSV string column using Spark's + `to_csv `_ function. + + .. note:: + + Can be used only with Spark 3.x+ + + .. versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the Column object containing the data to serialize to CSV. + + Returns + ------- + Column with string CSV data. Column name is the same as input column. + + Examples + -------- + + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import CSV + >>> df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into CSV format + >>> csv = CSV(delimiter=";") + >>> serialized_df = df.select("id", csv.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +--+--------+ + |id|value | + +--+--------+ + |1 |Alice;20| + |2 |Bob;25 | + +--+--------+ + >>> serialized_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) + """ + + from pyspark.sql import Column, SparkSession # noqa: WPS442 + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + self._check_spark_version_for_serialization(spark) + self._check_unsupported_serialization_options() + + from pyspark.sql.functions import col, to_csv + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + options = stringify(self.dict(by_alias=True)) + return to_csv(column, options).alias(column_name) + + def _check_spark_version_for_serialization(self, spark: SparkSession): + spark_version = 
get_spark_version(spark) + if spark_version.major < 3: + class_name = self.__class__.__name__ + error_msg = ( + f"`{class_name}.parse_column` or `{class_name}.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + raise ValueError(error_msg) + + def _check_unsupported_serialization_options(self): + unsupported_options = ["header", "compression", "inferSchema"] + current_options = self.dict() + for option in unsupported_options: + if current_options.get(option): + warnings.warn( + f"Option `{option}` is set but not supported in `CSV.parse_column` or `CSV.serialize_column`. " + "This may lead to unexpected behavior.", + UserWarning, + stacklevel=2, + ) diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index a7342f702..2ec12758a 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -53,11 +53,9 @@ class Excel(ReadWriteFileFormat): Supports reading/writing files with ``.xlsx`` (read/write) and ``.xls`` (read only) extensions. - .. versionadded:: 0.9.4 - .. dropdown:: Version compatibility - * Spark versions: 3.2.x - 3.5.x. + * Spark versions: 3.2.x - 3.5.x .. warning:: @@ -65,7 +63,6 @@ class Excel(ReadWriteFileFormat): See `Maven index `_ and `official documentation `_. - * Scala versions: 2.12 - 2.13 * Java versions: 8 - 20 See documentation from link above. @@ -77,6 +74,8 @@ class Excel(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.4 + Examples -------- @@ -114,9 +113,9 @@ class Config: @classmethod def get_packages( cls, - spark_version: str | Version, - scala_version: str | Version | None = None, - package_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| @@ -127,6 +126,8 @@ def get_packages( See `Maven index `_ and `official documentation `_. + .. 
versionadded:: 0.9.4 + Parameters ---------- spark_version : str @@ -167,28 +168,30 @@ def get_packages( """ if package_version: - version = Version.parse(package_version) - if version < (0, 15): + version = Version(package_version) + if version < Version("0.15"): # format="com.crealytics.spark.excel" does not support reading folder with files # format="excel" was added only in 0.14, but Maven package for 0.14 has different naming convention than recent versions. # So using 0.15 as the lowest supported version. raise ValueError(f"Package version should be at least 0.15, got {package_version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version(0, 20, 3) + version = Version("0.20.3") - spark_ver = Version.parse(spark_version) - if spark_ver < (3, 2): + spark_ver = Version(spark_version).min_digits(3) + if spark_ver < Version("3.2"): # Actually, Spark 2.4 is supported, but packages are built only for Scala 2.12 # when default pyspark==2.4.1 is built with Scala 2.11. 
# See https://github.com/crealytics/spark-excel/issues/426 raise ValueError(f"Spark version should be at least 3.2, got {spark_version}") - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) - if scala_ver.digits(2) < (2, 12): - raise ValueError(f"Scala version should be at least 2.12, got {scala_ver}") + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version should be at least 2.12, got {scala_ver.format('{0}.{1}')}") - return [f"com.crealytics:spark-excel_{scala_ver.digits(2)}:{spark_ver.digits(3)}_{version.digits(3)}"] + return [ + f"com.crealytics:spark-excel_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}_{version}", + ] @slot def check_if_supported(self, spark: SparkSession) -> None: diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 3ea82d7e6..698874424 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -6,11 +6,13 @@ from typing_extensions import Literal +from onetl._internal import stringify from onetl.file.format.file_format import ReadOnlyFileFormat from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import Column, SparkSession + from pyspark.sql.types import ArrayType, MapType, StructType READ_WRITE_OPTIONS = frozenset( @@ -78,6 +80,8 @@ class JSON(ReadOnlyFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- @@ -103,3 +107,140 @@ class Config: def check_if_supported(self, spark: SparkSession) -> None: # always available pass + + def parse_column(self, column: str | Column, schema: StructType | ArrayType | MapType) -> Column: + """ + Parses a JSON string column to a structured Spark SQL column using Spark's `from_json `_ function, based on the provided schema. + + .. 
versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing JSON strings/bytes to parse. + + schema : StructType | ArrayType | MapType + The schema to apply when parsing the JSON data. This defines the structure of the output DataFrame column. + + Returns + ------- + Column with deserialized data, with the same structure as the provided schema. Column name is the same as input column. + + Examples + -------- + + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import JSON + >>> df.show() + +----+--------------------+----------+---------+------+-----------------------+-------------+ + |key |value |topic |partition|offset|timestamp |timestampType| + +----+--------------------+----------+---------+------+-----------------------+-------------+ + |[31]|[7B 22 6E 61 6D 6...|topicJSON |0 |0 |2024-04-24 16:51:11.739|0 | + |[32]|[7B 22 6E 61 6D 6...|topicJSON |0 |1 |2024-04-24 16:51:11.749|0 | + +----+--------------------+----------+---------+------+-----------------------+-------------+ + >>> df.printSchema() + root + |-- key: binary (nullable = true) + |-- value: binary (nullable = true) + |-- topic: string (nullable = true) + |-- partition: integer (nullable = true) + |-- offset: integer (nullable = true) + |-- timestamp: timestamp (nullable = true) + |-- timestampType: integer (nullable = true) + >>> json = JSON() + >>> json_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... 
) + >>> parsed_df = df.select(decode("key", "UTF-8").alias("key"), json.parse_column("value", json_schema)) + >>> parsed_df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> parsed_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, from_json + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + + if isinstance(column, Column): + column_name, column = column._jc.toString(), column.cast("string") # noqa: WPS437 + else: + column_name, column = column, col(column).cast("string") + + options = stringify(self.dict(by_alias=True)) + return from_json(column, schema, options).alias(column_name) + + def serialize_column(self, column: str | Column) -> Column: + """ + Serializes a structured Spark SQL column into a JSON string column using Spark's + `to_json `_ function. + + .. versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing the data to serialize to JSON format. + + Returns + ------- + Column with string JSON data. Column name is the same as input column. 
+ + Examples + -------- + + >>> from pyspark.sql.functions import decode + >>> from onetl.file.format import JSON + >>> df.show() + +---+-----------+ + |key|value | + +---+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +---+-----------+ + >>> df.printSchema() + root + |-- key: string (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + >>> # serializing data into JSON format + >>> json = JSON() + >>> serialized_df = df.select("key", json.serialize_column("value")) + >>> serialized_df.show(truncate=False) + +---+-------------------------+ + |key|value | + +---+-------------------------+ + | 1|{"name":"Alice","age":20}| + | 2|{"name":"Bob","age":25} | + +---+-------------------------+ + >>> serialized_df.printSchema() + root + |-- key: string (nullable = true) + |-- value: string (nullable = true) + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + from pyspark.sql.functions import col, to_json + + self.check_if_supported(SparkSession._instantiatedSession) # noqa: WPS437 + + if isinstance(column, Column): + column_name = column._jc.toString() # noqa: WPS437 + else: + column_name, column = column, col(column) + + options = stringify(self.dict(by_alias=True)) + return to_json(column, options).alias(column_name) diff --git a/onetl/file/format/jsonline.py b/onetl/file/format/jsonline.py index 9bfd84159..1d1c910d1 100644 --- a/onetl/file/format/jsonline.py +++ b/onetl/file/format/jsonline.py @@ -72,6 +72,8 @@ class JSONLine(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/orc.py b/onetl/file/format/orc.py index a292b6c83..f108a1506 100644 --- a/onetl/file/format/orc.py +++ b/onetl/file/format/orc.py @@ -45,6 +45,8 @@ class ORC(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. 
versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/parquet.py b/onetl/file/format/parquet.py index 460819463..f96ad4445 100644 --- a/onetl/file/format/parquet.py +++ b/onetl/file/format/parquet.py @@ -49,6 +49,8 @@ class Parquet(ReadWriteFileFormat): The set of supported options depends on Spark version. See link above. + .. versionadded:: 0.9.0 + Examples -------- diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 47190ea77..cc7cd4777 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -19,7 +19,8 @@ from onetl.hooks import slot, support_hooks if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import Column, SparkSession + from pyspark.sql.types import StructType PROHIBITED_OPTIONS = frozenset( @@ -87,8 +88,7 @@ class XML(ReadWriteFileFormat): .. dropdown:: Version compatibility - * Spark versions: 3.2.x - 3.5.x. - * Scala versions: 2.12 - 2.13 + * Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 See documentation from link above. @@ -107,6 +107,8 @@ class XML(ReadWriteFileFormat): Using ``mode=FAILFAST`` will throw an exception instead of producing ``null`` values. `Follow `_ + .. versionadded:: 0.9.5 + Examples -------- Describe options how to read from/write to XML file with specific options: @@ -141,13 +143,15 @@ class Config: @classmethod def get_packages( # noqa: WPS231 cls, - spark_version: str | Version, - scala_version: str | Version | None = None, - package_version: str | Version | None = None, + spark_version: str, + scala_version: str | None = None, + package_version: str | None = None, ) -> list[str]: """ Get package names to be downloaded by Spark. |support_hooks| + .. versionadded:: 0.9.5 + Parameters ---------- spark_version : str @@ -159,7 +163,7 @@ def get_packages( # noqa: WPS231 If ``None``, ``spark_version`` is used to determine Scala version. package_version : str, optional - Package version in format ``major.minor.patch``. Default is ``0.17.0``. 
+ Package version in format ``major.minor.patch``. Default is ``0.18.0``. See `Maven index `_ for list of available versions. @@ -185,30 +189,30 @@ def get_packages( # noqa: WPS231 XML.get_packages( spark_version="3.5.0", scala_version="2.12", - package_version="0.17.0", + package_version="0.18.0", ) """ if package_version: - version = Version.parse(package_version) - if version < (0, 14): + version = Version(package_version).min_digits(3) + if version < Version("0.14"): raise ValueError(f"Package version must be above 0.13, got {version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version(0, 17, 0) + version = Version("0.18.0").min_digits(3) - spark_ver = Version.parse(spark_version) - scala_ver = Version.parse(scala_version) if scala_version else get_default_scala_version(spark_ver) + spark_ver = Version(spark_version) + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # Ensure compatibility with Spark and Scala versions - if spark_ver < (3, 0): + if spark_ver < Version("3.0"): raise ValueError(f"Spark version must be 3.x, got {spark_ver}") - if scala_ver < (2, 12) or scala_ver > (2, 13): - raise ValueError(f"Scala version must be 2.12 or 2.13, got {scala_ver}") + if scala_ver < Version("2.12"): + raise ValueError(f"Scala version must be at least 2.12, got {scala_ver.format('{0}.{1}')}") - return [f"com.databricks:spark-xml_{scala_ver.digits(2)}:{version.digits(3)}"] + return [f"com.databricks:spark-xml_{scala_ver.format('{0}.{1}')}:{version}"] @slot def check_if_supported(self, spark: SparkSession) -> None: @@ -226,3 +230,129 @@ def check_if_supported(self, spark: SparkSession) -> None: if log.isEnabledFor(logging.DEBUG): log.debug("Missing Java class", exc_info=e, stack_info=True) raise ValueError(msg) from e + + def parse_column(self, column: str | Column, schema: StructType) -> Column: + """ + Parses an XML string column 
into a structured Spark SQL column using the ``from_xml`` function + provided by the `Databricks Spark XML library <https://github.com/databricks/spark-xml>`_ + based on the provided schema. + + .. note:: + + This method assumes that the ``spark-xml`` package is installed: :obj:`XML.get_packages <onetl.file.format.xml.XML.get_packages>`. + + .. note:: + + This method parses each DataFrame row individually. Therefore, for a specific column, each row must contain exactly one occurrence of the ``rowTag`` specified. + If your XML data includes a root tag that encapsulates multiple row tags, you can adjust the schema to use an ``ArrayType`` to keep all child elements under the single root. + + .. code-block:: xml + + <books> + <book><title>Book One</title><author>Author A</author></book> + <book><title>Book Two</title><author>Author B</author></book> + </books> + + And the corresponding schema in Spark using an ``ArrayType``: + + .. code-block:: python + + from pyspark.sql.types import StructType, StructField, ArrayType, StringType + from onetl.file.format import XML + + # each DataFrame row has exactly one <books> tag + xml = XML(rowTag="books") + # each <books> tag has multiple <book> tags, so using ArrayType for such field + schema = StructType( + [ + StructField( + "book", + ArrayType( + StructType( + [ + StructField("title", StringType(), True), + StructField("author", StringType(), True), + ], + ), + ), + True, + ), + ], + ) + + .. versionadded:: 0.11.0 + + Parameters + ---------- + column : str | Column + The name of the column or the column object containing XML strings/bytes to parse. + + schema : StructType + The schema to apply when parsing the XML data. This defines the structure of the output DataFrame column. + + Returns + ------- + Column with deserialized data, with the same structure as the provided schema. Column name is the same as input column. 
+ + Examples + -------- + + >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType + >>> from onetl.file.format import XML + >>> df.show() + +--+------------------------------------------------+ + |id|value | + +--+------------------------------------------------+ + |1 |<person><name>Alice</name><age>20</age></person>| + |2 |<person><name>Bob</name><age>25</age></person> | + +--+------------------------------------------------+ + >>> df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: string (nullable = true) + >>> xml = XML(rowTag="person") + >>> xml_schema = StructType( + ... [ + ... StructField("name", StringType(), nullable=True), + ... StructField("age", IntegerType(), nullable=True), + ... ], + ... ) + >>> parsed_df = df.select("id", xml.parse_column("value", xml_schema)) + >>> parsed_df.show() + +--+-----------+ + |id|value | + +--+-----------+ + |1 |{Alice, 20}| + |2 | {Bob, 25}| + +--+-----------+ + >>> parsed_df.printSchema() + root + |-- id: integer (nullable = true) + |-- value: struct (nullable = true) + | |-- name: string (nullable = true) + | |-- age: integer (nullable = true) + """ + from pyspark.sql import Column, SparkSession # noqa: WPS442 + + spark = SparkSession._instantiatedSession # noqa: WPS437 + self.check_if_supported(spark) + + from pyspark.sql.column import _to_java_column # noqa: WPS450 + from pyspark.sql.functions import col + + if isinstance(column, Column): + column_name, column = column._jc.toString(), column.cast("string") # noqa: WPS437 + else: + column_name, column = column, col(column).cast("string") + + java_column = _to_java_column(column) + java_schema = spark._jsparkSession.parseDataType(schema.json()) # noqa: WPS437 + scala_options = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap( # noqa: WPS219, WPS437 + self.dict(), + ) + jc = spark._jvm.com.databricks.spark.xml.functions.from_xml( # noqa: WPS219, WPS437 + java_column, + java_schema, + 
scala_options, + ) + return Column(jc).alias(column_name) diff --git 
a/onetl/file/limit/limits_reached.py b/onetl/file/limit/limits_reached.py index fe3d98b85..27d7fb502 100644 --- a/onetl/file/limit/limits_reached.py +++ b/onetl/file/limit/limits_reached.py @@ -14,6 +14,8 @@ def limits_reached(limits: Iterable[BaseFileLimit]) -> bool: """ Check if any of limits reached. + .. versionadded:: 0.8.0 + Parameters ---------- limits : Iterable of :obj:`onetl.base.base_file_limit.BaseFileLimit` @@ -28,18 +30,17 @@ def limits_reached(limits: Iterable[BaseFileLimit]) -> bool: Examples -------- - .. code:: python - - from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at - from onetl.impl import LocalPath - - limits = [MaxFilesCount(2)] - assert not limits_reached(limits) - - assert not limits_stop_at(LocalPath("/path/to/file.csv"), limits) - assert limits_stop_at(LocalPath("/path/to/file.csv"), limits) - - assert limits_reached(limits) + >>> from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(2)] + >>> limits_reached(limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True + >>> limits_reached(limits) + True """ debug = log.isEnabledFor(logging.DEBUG) diff --git a/onetl/file/limit/limits_stop_at.py b/onetl/file/limit/limits_stop_at.py index 472c94426..035ac6424 100644 --- a/onetl/file/limit/limits_stop_at.py +++ b/onetl/file/limit/limits_stop_at.py @@ -14,6 +14,8 @@ def limits_stop_at(path: PathProtocol, limits: Iterable[BaseFileLimit]) -> bool: """ Check if some of limits stops at given path. + .. versionadded:: 0.8.0 + Parameters ---------- path : :obj:`onetl.base.path_protocol.PathProtocol` @@ -31,15 +33,13 @@ def limits_stop_at(path: PathProtocol, limits: Iterable[BaseFileLimit]) -> bool: Examples -------- - .. 
code:: python - - from onetl.file.limit import MaxFilesCount, limits_stop_at - from onetl.impl import LocalPath - - limits = [MaxFilesCount(1)] - - assert not limits_stop_at(LocalPath("/path/to/file.csv"), limits) - assert limits_stop_at(LocalPath("/path/to/file.csv"), limits) + >>> from onetl.file.limit import MaxFilesCount, limits_stop_at + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(2)] + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + False + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True """ reached = [] for limit in limits: diff --git a/onetl/file/limit/max_files_count.py b/onetl/file/limit/max_files_count.py index 5f2f584d6..ec604ff41 100644 --- a/onetl/file/limit/max_files_count.py +++ b/onetl/file/limit/max_files_count.py @@ -13,6 +13,9 @@ class MaxFilesCount(BaseFileLimit, FrozenModel): """Limits the total number of files handled by :ref:`file-downloader` or :ref:`file-mover`. + .. versionadded:: 0.8.0 + Replaces deprecated ``onetl.core.FileLimit`` + Parameters ---------- diff --git a/onetl/file/limit/reset_limits.py b/onetl/file/limit/reset_limits.py index 4a30ca445..de9201da8 100644 --- a/onetl/file/limit/reset_limits.py +++ b/onetl/file/limit/reset_limits.py @@ -29,18 +29,18 @@ def reset_limits(limits: Iterable[BaseFileLimit]) -> list[BaseFileLimit]: Examples -------- - .. 
code:: python - - from onetl.file.limit import MaxFilesCount, limits_reached, reset_limits - from onetl.impl import LocalPath - - limits = [MaxFilesCount(1)] - - assert not limits_reached(limits) - # do something - assert limits_reached(limits) - - new_limits = reset_limits(limits) - assert not limits_reached(new_limits) + >>> from onetl.file.limit import MaxFilesCount, limits_reached, limits_stop_at, reset_limits + >>> from onetl.impl import LocalPath + >>> limits = [MaxFilesCount(1)] + >>> limits_reached(limits) + False + >>> # do something + >>> limits_stop_at(LocalPath("/path/to/file.csv"), limits) + True + >>> limits_reached(limits) + True + >>> new_limits = reset_limits(limits) + >>> limits_reached(new_limits) + False """ return [limit.reset() for limit in limits] diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index c0e9a9f25..452003848 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -24,6 +24,8 @@ class HookPriority(int, Enum): Hook priority enum. All hooks within the same priority are executed in the same order they were registered. + + .. versionadded:: 0.7.0 """ FIRST = -1 @@ -41,6 +43,8 @@ class Hook(Generic[T]): # noqa: WPS338 """ Hook representation. + .. versionadded:: 0.7.0 + Parameters ---------- @@ -81,16 +85,18 @@ def enable(self): """ Enable the hook. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - hook = Hook(..., enabled=False) - assert not hook.enabled - - hook.enable() - assert hook.enabled + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=False) + >>> hook.enabled + False + >>> hook.enable() + >>> hook.enabled + True """ if self.enabled: logger.log( @@ -107,16 +113,18 @@ def disable(self): """ Disable the hook. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - hook = Hook(..., enabled=True) - assert hook.enabled - - hook.disable() - assert not hook.enabled + >>> def func1(): ... 
+ >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> hook.disable() + >>> hook.enabled + False """ if self.enabled: logger.log(NOTICE, "|Hooks| Disable hook '%s.%s'", self.callback.__module__, self.callback.__qualname__) @@ -141,35 +149,40 @@ def skip(self): You should call :obj:`~enable` explicitly to change its state. + .. versionadded:: 0.7.0 + Examples -------- .. tabs:: - .. code-tab:: py Context manager syntax - - hook = Hook(..., enabled=True) - assert hook.enabled - - with hook.skip(): - assert not hook.enabled - - # hook state is restored as it was before entering the context manager - assert hook.enabled - - .. code-tab:: py Decorator syntax - - hook = Hook(..., enabled=True) - assert hook.enabled - - @hook.skip() - def hook_disabled(): - assert not hook.enabled - - hook_disabled() - - # hook state is restored as it was before entering the context manager - assert hook.enabled + .. tab:: Context manager syntax + + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> with hook.skip(): + ... print(hook.enabled) + False + >>> # hook state is restored as it was before entering the context manager + >>> hook.enabled + True + + .. tab:: Decorator syntax + + >>> def func1(): ... + >>> hook = Hook(callback=func1, enabled=True) + >>> hook.enabled + True + >>> @hook.skip() + ... def hook_disabled(): + ... print(hook.enabled) + >>> hook_disabled() + False + >>> # hook state is restored as it was before entering the context manager + >>> hook.enabled + True """ if not self.enabled: logger.log( @@ -205,18 +218,17 @@ def __call__(self, *args, **kwargs) -> T | ContextDecorator: Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook, HookPriority - - - def some_func(*args, **kwargs): ... 
- - - hook = Hook(callback=some_func) - - result = hook(1, "abc", some="arg") - assert result == some_func(1, "abc", some="arg") + >>> from onetl.hooks.hook import Hook, HookPriority + >>> def some_func(*args, **kwargs): + ... print(args) + ... print(kwargs) + ... return "func result" + >>> hook = Hook(callback=some_func) + >>> result = hook(1, "abc", some="arg") + (1, 'abc') + {'some': 'arg'} + >>> result + 'func result' """ result = self.callback(*args, **kwargs) if isinstance(result, Generator): @@ -363,6 +375,8 @@ def hook(inp: Callable[..., T] | None = None, enabled: bool = True, priority: Ho """ Initialize hook from callable/context manager. + .. versionadded:: 0.7.0 + Examples -------- diff --git a/onetl/hooks/hook_collection.py b/onetl/hooks/hook_collection.py index 6c6643122..d715086c6 100644 --- a/onetl/hooks/hook_collection.py +++ b/onetl/hooks/hook_collection.py @@ -16,6 +16,8 @@ class HookCollection: """ Representation of hooks collection. + .. versionadded:: 0.7.0 + Examples -------- @@ -34,26 +36,27 @@ def __init__(self, hooks: list[Hook] | HookCollection | None = None): @property def active(self): """ - Return HookCollection but containing only hooks with enabled state. + Return new HookCollection but containing only hooks with ``enabled=True`` state. If called after :obj:`~stop` or inside :obj:`~skip`, empty collection will be returned. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) - - assert hooks.active == HookCollection([Hook(callback=func1)]) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1, enabled=True), + ... Hook(callback=func2, enabled=False), + ... 
], + ... ) + >>> len(hooks.active) + 1 """ if self._enabled: @@ -65,23 +68,24 @@ def stop(self) -> None: """ Stop all hooks in the collection. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - hooks.stop() - assert hooks.active == HookCollection() + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> hooks.stop() + >>> hooks.active + HookCollection([]) """ self._enabled = False @@ -94,29 +98,29 @@ def resume(self) -> None: If hook is disabled by :obj:`onetl.hooks.hook.Hook.disable`, it will stay disabled. You should call :obj:`onetl.hooks.hook.Hook.enable` explicitly. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - hooks.resume() - - assert hooks.active == HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> hooks.resume() + >>> hooks.active # doctest: +SKIP + HookCollection( + [ + Hook(callback=func1), + Hook(callback=func2), + ] + ) """ self._enabled = True @@ -132,28 +136,31 @@ def skip(self): after exiting the context/decorated function. You should call :obj:`~resume` explicitly. + .. versionadded:: 0.7.0 + Examples -------- - .. 
code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1), - Hook(callback=func2), - ] - ) - - # hooks state is same as created by constructor - - with hooks.skip(): - # all hooks are disabled here - ... - - # hooks state is restored as it was before entering the context manager + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> # hooks state is same as created by constructor + >>> len(hooks.active) + 2 + >>> with hooks.skip(): + ... # all hooks are disabled here + ... print(len(hooks.active)) + 0 + >>> # hooks state is restored as it was before entering the context manager + >>> len(hooks.active) + 2 """ if not self._enabled: @@ -168,102 +175,100 @@ def skip(self): def add(self, item: Hook): """Appends hook to the collection. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - ] - ) - - new_hook = Hook(callback=func2, enabled=False) - hooks.add(new_hook) - - assert hooks == HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... ], + ... ) + + >>> new_hook = Hook(callback=func2) + >>> hooks.add(new_hook) + >>> len(hooks.active) + 2 """ self._hooks.append(item) def extend(self, hooks: Iterable[Hook]): """Extends collection using a iterator. + .. versionadded:: 0.7.0 + Examples -------- - .. 
code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - ] - ) - - new_hooks = [Hook(callback=func2, enabled=False)] - hooks.extend(new_hook) - - assert hooks == HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=False), - ] - ) + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... ], + ... ) + + >>> new_hooks = [Hook(callback=func2)] + >>> hooks.extend(new_hooks) + >>> len(hooks.active) + 2 """ self._hooks.extend(hooks) def __iter__(self): """Iterate over hooks in the collection. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=True), - ] - ) - - for hook in hooks: - assert hook.enabled + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... + >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1, enabled=True), + ... Hook(callback=func2, enabled=False), + ... ], + ... ) + >>> for hook in hooks: + ... print(hook.enabled) + True + False """ return iter(self._hooks) def __len__(self): """Return collection length. + .. versionadded:: 0.7.0 + Examples -------- - .. code:: python - - from onetl.hooks.hook import Hook - from onetl.hooks.hook_collection import HookCollection - - hooks = HookCollection( - [ - Hook(callback=func1, enabled=True), - Hook(callback=func2, enabled=True), - ] - ) - - assert len(hooks) == 2 + >>> from onetl.hooks.hook import Hook + >>> from onetl.hooks.hook_collection import HookCollection + >>> def func1(): ... 
+ >>> def func2(): ... + >>> hooks = HookCollection( + ... [ + ... Hook(callback=func1), + ... Hook(callback=func2), + ... ], + ... ) + >>> len(hooks) + 2 """ return len(self._hooks) + + def __repr__(self): + return f"HookCollection({self._hooks})" diff --git a/onetl/hooks/hooks_state.py b/onetl/hooks/hooks_state.py index de46b33c0..53a2c0c39 100644 --- a/onetl/hooks/hooks_state.py +++ b/onetl/hooks/hooks_state.py @@ -22,6 +22,8 @@ def stop(cls) -> None: """ Stop all hooks for all classes. + .. versionadded:: 0.7.0 + Examples -------- @@ -51,6 +53,8 @@ def resume(cls) -> None: This function does not enable hooks which were disabled by :obj:`onetl.hooks.hook.Hook.disable`, or stopped by :obj:`onetl.hooks.support_hooks.suspend_hooks`. + .. versionadded:: 0.7.0 + Examples -------- @@ -84,6 +88,8 @@ def skip(cls): after exiting the context/decorated function. You should call :obj:`~resume_all_hooks` explicitly. + .. versionadded:: 0.7.0 + Examples -------- diff --git a/onetl/hooks/method_inheritance_stack.py b/onetl/hooks/method_inheritance_stack.py index c65a2924e..99fef076c 100644 --- a/onetl/hooks/method_inheritance_stack.py +++ b/onetl/hooks/method_inheritance_stack.py @@ -20,30 +20,28 @@ class MethodInheritanceStack: Examples -------- - .. 
code:: python - - @support_hooks - class BaseClass: - @slot - def some_method(self, *args, **kwargs): - pass - - - @support_hooks - class MyClass(BaseClass): - @slot - def some_method(self, *args, **kwargs): - self.do_something() - super().some_method(*args, **kwargs) - - - # caused by MyClass.some_method() call - with MethodInheritanceStack(MyClass, "some_method") as method_call_stack: - assert method_call_stack.level == 0 - - # MyClass.some_method() called super().some_method() - with MethodInheritanceStack(BaseClass, "some_method") as method_call_stack: - assert method_call_stack.level == 1 + >>> from onetl.hooks import support_hooks, slot + >>> from onetl.hooks.method_inheritance_stack import MethodInheritanceStack + >>> @support_hooks + ... class BaseClass: + ... @slot + ... def some_method(self, *args, **kwargs): + ... pass + >>> @support_hooks + ... class MyClass(BaseClass): + ... @slot + ... def some_method(self, *args, **kwargs): + ... self.do_something() + ... super().some_method(*args, **kwargs) + + >>> # caused by MyClass.some_method() call + >>> with MethodInheritanceStack(MyClass, "some_method") as method_call_stack: + ... print("MyClass", method_call_stack.level) + ... # MyClass.some_method() called super().some_method() + ... with MethodInheritanceStack(BaseClass, "some_method") as method_call_stack: + ... print("BaseClass", method_call_stack.level) + MyClass 0 + BaseClass 1 """ _stack: dict[type, dict[str, int]] = defaultdict(lambda: defaultdict(int)) diff --git a/onetl/hooks/slot.py b/onetl/hooks/slot.py index cf9171f63..6d4b0b879 100644 --- a/onetl/hooks/slot.py +++ b/onetl/hooks/slot.py @@ -67,6 +67,8 @@ def bind_hook(method: Callable, inp=None): See :ref:`hooks-design` for more details. + .. 
versionadded:: 0.7.0 + Examples -------- @@ -255,6 +257,8 @@ def register_slot(cls: type, method_name: str): # noqa: WPS231, WPS213, WPS212 Also ``@classmethod`` is a descriptor, and it can be called only my accessing the class itself, which is not possible within a decorator. + .. versionadded:: 0.7.0 + Examples -------- @@ -465,7 +469,10 @@ def is_slot(method: Callable) -> bool: class Slot(Protocol): - """Protocol which is implemented by a method after applying :obj:`~slot` decorator.""" + """Protocol which is implemented by a method after applying :obj:`~slot` decorator. + + .. versionadded:: 0.7.0 + """ def __call__(self, *args, **kwargs): ... @@ -637,6 +644,8 @@ def slot(method: Method) -> Method: It is not allowed to use this decorator over ``_private`` and ``__protected`` methods and ``@property``. But is allowed to use on ``__dunder__`` methods, like ``__init__``. + .. versionadded:: 0.7.0 + Examples --------- diff --git a/onetl/hooks/support_hooks.py b/onetl/hooks/support_hooks.py index 052ac64fd..33d440b55 100644 --- a/onetl/hooks/support_hooks.py +++ b/onetl/hooks/support_hooks.py @@ -91,6 +91,8 @@ def with_all_hooks_disabled(): # running outside a decorated function restores previous behavior obj = MyClass() obj.my_method(2) # will execute callback(obj, 2) + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -126,6 +128,8 @@ def callback(self, arg): ... MyClass.suspend_hooks() obj.my_method(2) # will NOT execute callback + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -161,6 +165,8 @@ def callback(self, arg): ... MyClass.resume_hooks() obj.my_method(2) # will execute callback(obj, 2) + + .. versionadded:: 0.7.0 """ slots = get_slots(cls) @@ -177,6 +183,8 @@ def support_hooks(cls: Klass) -> Klass: Adds :obj:`~skip_hooks`, :obj:`~suspend_hooks` and :obj:`~resume_hooks` to the class. + .. 
versionadded:: 0.7.0 + Examples --------- diff --git a/onetl/hwm/store/hwm_class_registry.py b/onetl/hwm/store/hwm_class_registry.py index ccd074e40..82b0eef2e 100644 --- a/onetl/hwm/store/hwm_class_registry.py +++ b/onetl/hwm/store/hwm_class_registry.py @@ -13,18 +13,16 @@ class SparkTypeToHWM: Examples -------- - .. code:: python - - from etl_entities.hwm import ColumnIntHWM, ColumnDateHWM - from onetl.hwm.store import SparkTypeToHWM - - assert SparkTypeToHWM.get("integer") == ColumnIntHWM - assert SparkTypeToHWM.get("short") == ColumnIntHWM # multiple type names are supported - - assert SparkTypeToHWM.get("date") == ColumnDateHWM - - assert SparkTypeToHWM.get("unknown") is None - + >>> from etl_entities.hwm import ColumnIntHWM, ColumnDateHWM + >>> from onetl.hwm.store import SparkTypeToHWM + >>> SparkTypeToHWM.get("integer") + + >>> # multiple type names are supported + >>> SparkTypeToHWM.get("short") + + >>> SparkTypeToHWM.get("date") + + >>> SparkTypeToHWM.get("unknown") """ _mapping: ClassVar[dict[str, type[HWM]]] = { @@ -57,20 +55,15 @@ def register_spark_type_to_hwm_type_mapping(*type_names: str): Examples -------- - .. code:: python - - from etl_entities import HWM - from onetl.hwm.store import SparkTypeToHWM - from onetl.hwm.store import SparkTypeToHWM, register_spark_type_to_hwm_type_mapping - - - @register_spark_type_to_hwm_type_mapping("somename", "anothername") - class MyHWM(HWM): ... - - - assert SparkTypeToHWM.get("somename") == MyClass - assert SparkTypeToHWM.get("anothername") == MyClass - + >>> from etl_entities.hwm import ColumnHWM + >>> from onetl.hwm.store import SparkTypeToHWM + >>> from onetl.hwm.store import SparkTypeToHWM, register_spark_type_to_hwm_type_mapping + >>> @register_spark_type_to_hwm_type_mapping("somename", "anothername") + ... class MyHWM(ColumnHWM): ... 
+ >>> SparkTypeToHWM.get("somename") + + >>> SparkTypeToHWM.get("anothername") + """ def wrapper(cls: type[HWM]): diff --git a/onetl/log.py b/onetl/log.py index 32c239b8b..aeed6403f 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -56,6 +56,9 @@ def setup_notebook_logging(level: int | str = logging.INFO) -> None: Should **NOT** be used in applications, you should set up logging settings manually, according to your framework documentation. + .. deprecated:: 0.5.0 + Use :obj:`~setup_logging` instead + Parameters ---------- level : ``int`` or ``str``, default ``INFO`` @@ -80,6 +83,9 @@ def setup_logging(level: int | str = logging.INFO, enable_clients: bool = False) Should be used only in IDEs (like Jupyter notebooks or PyCharm), or scripts (ETL pipelines). + .. versionchanged:: 0.5.0 + Renamed ``setup_notebook_logging`` → ``setup_logging`` + Parameters ---------- level : ``int`` or ``str``, default ``INFO`` @@ -92,6 +98,8 @@ def setup_logging(level: int | str = logging.INFO, enable_clients: bool = False) .. note:: For ``level="DEBUG"`` it is recommended to use ``enable_clients=True`` + + .. versionadded:: 0.9.0 """ logging.basicConfig(level=level) @@ -118,6 +126,9 @@ def setup_clients_logging(level: int | str = DISABLED) -> None: Can be used in applications, but it is recommended to set up these loggers according to your framework documentation. + .. versionchanged:: 0.9.0 + Renamed ``disable_clients_logging`` → ``setup_clients_logging`` + Parameters ---------- level : ``int`` or ``str``, default ``DISABLED`` @@ -127,6 +138,8 @@ def setup_clients_logging(level: int | str = DISABLED) -> None: For ``py4j``, logging level with maximum verbosity is ``INFO`` because ``DEBUG`` logs are totally unreadable. + + .. 
versionadded:: 0.9.0 """ for client_module in CLIENT_MODULES: diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 267b01613..0397514b6 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -75,21 +75,26 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): .. code:: python - assert download_result == DownloadResult( - successful=[ - "/path/my/file1", - "/path/my/file2", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file1"), + LocalFile("/downloaded/file2"), + }, ) Then the downloaded files list is saved as ``FileListHWM`` object into :ref:`HWM Store `: .. code:: python - [ - "/path/my/file1", - "/path/my/file2", - ] + FileListHWM( + ..., + entity="/path", + value=[ + "/path/my/file1", + "/path/my/file2", + ], + ) Next incremental run will download only new files from the source: @@ -104,22 +109,26 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): .. code:: python # only files which are not in FileListHWM - - assert download_result == DownloadResult( - successful=[ - "/path/my/file3", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file3"), + }, ) New files will be added to the ``FileListHWM`` and saved to :ref:`HWM Store `: .. code:: python - [ - "/path/my/file1", - "/path/my/file2", - "/path/my/file3", - ] + FileListHWM( + ..., + entity="/path", + value=[ + "/path/my/file1", + "/path/my/file2", + "/path/my/file3", + ], + ) .. warning:: @@ -130,6 +139,8 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): * FileDownloader creates files on local filesystem, and file content may differ for different :obj:`modes `. * It can remove files from the source if :obj:`delete_source ` is set to ``True``. + .. versionadded:: 0.1.0 + Parameters ---------- offset : Any, default: ``None`` @@ -391,6 +402,8 @@ class IncrementalBatchStrategy(OffsetMixin, BatchHWMStrategy): supports batch strategy. 
For example, Kafka connection doesn't support it. Make sure the connection you use is compatible with the IncrementalBatchStrategy. + .. versionadded:: 0.1.0 + Parameters ---------- step : Any diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 5368b2039..77ed4b35b 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -38,13 +38,16 @@ class SnapshotStrategy(BaseStrategy): .. code:: python - assert download_result == DownloadResult( - successful=[ - "/path/my/file1", - "/path/my/file2", - ] + DownloadResult( + ..., + successful={ + LocalFile("/downloaded/file1"), + LocalFile("/downloaded/file2"), + }, ) + .. versionadded:: 0.1.0 + Examples -------- @@ -157,6 +160,8 @@ class SnapshotBatchStrategy(BatchHWMStrategy): supports batch strategy. For example, Kafka connection doesn't support it. Make sure the connection you use is compatible with the SnapshotBatchStrategy. + .. versionadded:: 0.1.0 + Parameters ---------- step : Any diff --git a/pytest.ini b/pytest.ini index 3c71e8eb6..a1143d118 100644 --- a/pytest.ini +++ b/pytest.ini @@ -27,3 +27,10 @@ markers = samba: Samba tests teradata: Teradata tests webdav: WebDAV tests + csv: CSV tests + json: JSON tests + orc: ORC tests + parquet: Parquet tests + xml: XML tests + avro: Avro tests + excel: Excel tests diff --git a/requirements/tests/base.txt b/requirements/tests/base.txt index 17e7ef712..b702fca16 100644 --- a/requirements/tests/base.txt +++ b/requirements/tests/base.txt @@ -4,3 +4,5 @@ pytest<8 pytest-lazy-fixture pytest-mock pytest-rerunfailures +requests +responses diff --git a/requirements/tests/spark-3.3.3.txt b/requirements/tests/spark-3.3.4.txt similarity index 80% rename from requirements/tests/spark-3.3.3.txt rename to requirements/tests/spark-3.3.4.txt index 259340bf6..55629ed65 100644 --- a/requirements/tests/spark-3.3.3.txt +++ b/requirements/tests/spark-3.3.4.txt @@ -1,5 +1,5 @@ numpy>=1.16,<1.24 pandas>=1.0,<2 
pyarrow>=1.0 -pyspark==3.3.3 +pyspark==3.3.4 sqlalchemy<2.0 diff --git a/requirements/tests/spark-3.4.2.txt b/requirements/tests/spark-3.4.3.txt similarity index 76% rename from requirements/tests/spark-3.4.2.txt rename to requirements/tests/spark-3.4.3.txt index c7173637d..5ea738d58 100644 --- a/requirements/tests/spark-3.4.2.txt +++ b/requirements/tests/spark-3.4.3.txt @@ -1,5 +1,5 @@ numpy>=1.16 pandas>=1.0 pyarrow>=1.0 -pyspark==3.4.2 +pyspark==3.4.3 sqlalchemy diff --git a/requirements/tests/spark-3.5.1.txt b/requirements/tests/spark-3.5.1.txt new file mode 100644 index 000000000..d1e812f7a --- /dev/null +++ b/requirements/tests/spark-3.5.1.txt @@ -0,0 +1,5 @@ +numpy>=1.16 +pandas>=1.0 +pyarrow>=1.0 +pyspark==3.5.1 +sqlalchemy diff --git a/setup.cfg b/setup.cfg index 96b763b7c..b72ff1b1d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -272,8 +272,10 @@ ignore = # https://github.com/wemake-services/wemake-python-styleguide/issues/2847 # E704 multiple statements on one line: def func(): ... 
E704, -# WPS220 Found too deep nesting: 46 > 20 - WPS220 +# WPS474 Found import object collision + WPS474, +# WPS318 Found extra indentation + WPS318 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = diff --git a/setup.py b/setup.py index d428bd826..c7ce5d0dc 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,14 @@ def parse_requirements(file: Path) -> list[str]: requirements_hdfs = parse_requirements(here / "requirements" / "hdfs.txt") requirements_s3 = parse_requirements(here / "requirements" / "s3.txt") requirements_webdav = parse_requirements(here / "requirements" / "webdav.txt") -requirements_files = [*requirements_ftp, *requirements_sftp, *requirements_hdfs, *requirements_s3, *requirements_webdav] +requirements_files = [ + *requirements_ftp, + *requirements_sftp, + *requirements_hdfs, + *requirements_s3, + *requirements_webdav, + *requirements_samba, +] requirements_kerberos = parse_requirements(here / "requirements" / "kerberos.txt") requirements_spark = parse_requirements(here / "requirements" / "spark.txt") diff --git a/tests/.coveragerc b/tests/.coveragerc index 5d7fb16f9..08633e6cc 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -17,6 +17,7 @@ exclude_lines = class .*\bProtocol\): @(abc\.)?abstractmethod if pyspark_version + if spark_version spark = SparkSession._instantiatedSession if log.isEnabledFor(logging.DEBUG): if sys.version_info diff --git a/tests/fixtures/processing/clickhouse.py b/tests/fixtures/processing/clickhouse.py index 1205fad6a..bf0b2f3e7 100644 --- a/tests/fixtures/processing/clickhouse.py +++ b/tests/fixtures/processing/clickhouse.py @@ -20,7 +20,7 @@ class ClickhouseProcessing(BaseProcessing): "text_string": "String", "hwm_int": "Int32", "hwm_date": "Date", - "hwm_datetime": "DateTime", + "hwm_datetime": "DateTime64(6)", "float_value": "Float32", } @@ -152,3 +152,19 @@ def get_expected_dataframe( order_by: str | None = None, ) -> 
pandas.DataFrame: return self.connection.query_dataframe(self.get_expected_dataframe_ddl(schema, table, order_by)) + + def fix_pandas_df( + self, + df: pandas.DataFrame, + ) -> pandas.DataFrame: + df = super().fix_pandas_df(df) + + for column in df.columns: + column_name = column.lower() + + if "float" in column_name: + # somewhere in chain Clickhouse -> Spark -> Pandas Float32 column is being converted to Float64, + # causing tests to fail. Convert it back to original type + df[column] = df[column].astype("float32") + + return df diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index b2559d520..dbc03ba33 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -5,6 +5,7 @@ import pytest from onetl._util.spark import get_pyspark_version +from onetl._util.version import Version @pytest.fixture(scope="session") @@ -31,7 +32,7 @@ def ivysettings_path(): @pytest.fixture(scope="session") -def maven_packages(): +def maven_packages(request): from onetl.connection import ( MSSQL, Clickhouse, @@ -47,43 +48,63 @@ def maven_packages(): from onetl.file.format import XML, Avro, Excel pyspark_version = get_pyspark_version() - packages = ( - Clickhouse.get_packages() - + MSSQL.get_packages() - + MySQL.get_packages() - + Oracle.get_packages() - + Postgres.get_packages() - + Teradata.get_packages() - ) - with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" - if with_greenplum: - # Greenplum connector jar is not publicly available, + # get markers from all downstream tests + markers = set() + for func in request.session.items: + markers.update(marker.name for marker in func.iter_markers()) + + packages: list[str] = [] + if "clickhouse" in markers: + packages.extend(Clickhouse.get_packages()) + + if "mssql" in markers: + packages.extend(MSSQL.get_packages()) + + if "mysql" in markers: + packages.extend(MySQL.get_packages()) + + if "oracle" in markers: + packages.extend(Oracle.get_packages()) + + if "postgres" in markers: +
packages.extend(Postgres.get_packages()) + + if "teradata" in markers: + packages.extend(Teradata.get_packages()) + + if "greenplum" in markers: packages.extend( Greenplum.get_packages( - spark_version=pyspark_version, + spark_version=str(pyspark_version), package_version=os.getenv("ONETL_GP_PACKAGE_VERSION") or None, ), ) - if pyspark_version >= (2, 4): - # There is no Avro package for Spark 2.3 - packages.extend(Avro.get_packages(spark_version=pyspark_version)) - # Kafka connector for Spark 2.3 is too old and not supported - packages.extend(Kafka.get_packages(spark_version=pyspark_version)) - - if pyspark_version >= (3, 2): - # There is no SparkS3 connector for Spark less than 3 - packages.extend(SparkS3.get_packages(spark_version=pyspark_version)) - - # There is no XML files support for Spark less than 3 - packages.extend(XML.get_packages(spark_version=pyspark_version)) - - # There is no MongoDB connector for Spark less than 3.2 - packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) - - # There is no Excel files support for Spark less than 3.2 - packages.extend(Excel.get_packages(spark_version=pyspark_version)) + if pyspark_version >= Version("2.4"): + if "avro" in markers: + # There is no Avro package for Spark 2.3 + packages.extend(Avro.get_packages(spark_version=str(pyspark_version))) + if "kafka" in markers: + # Kafka connector for Spark 2.3 is too old and not supported + packages.extend(Kafka.get_packages(spark_version=str(pyspark_version))) + + if pyspark_version >= Version("3.2"): + if "s3" in markers: + # There is no SparkS3 connector for Spark less than 3 + packages.extend(SparkS3.get_packages(spark_version=str(pyspark_version))) + + if "xml" in markers: + # There is no XML files support for Spark less than 3 + packages.extend(XML.get_packages(spark_version=str(pyspark_version))) + + if "mongodb" in markers: + # There is no MongoDB connector for Spark less than 3.2 + 
packages.extend(MongoDB.get_packages(spark_version=str(pyspark_version))) + + if "excel" in markers: + # There is no Excel files support for Spark less than 3.2 + packages.extend(Excel.get_packages(spark_version=str(pyspark_version))) return packages diff --git a/tests/tests_integration/test_file_format_integration/test_avro_integration.py b/tests/tests_integration/test_file_format_integration/test_avro_integration.py index eaffd6499..464727a2a 100644 --- a/tests/tests_integration/test_file_format_integration/test_avro_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_avro_integration.py @@ -4,18 +4,24 @@ Do not test all the possible options and combinations, we are not testing Spark here. """ +import contextlib + import pytest +import responses from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import Avro try: + from pyspark.sql.functions import col + from tests.util.assert_df import assert_equal_df except ImportError: - pytest.skip("Missing pandas", allow_module_level=True) + pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.avro] @pytest.fixture() @@ -29,8 +35,11 @@ def avro_schema(): {"name": "id", "type": ["null", "int"]}, {"name": "str_value", "type": ["null", "string"]}, {"name": "int_value", "type": ["null", "int"]}, - {"name": "date_value", "type": ["null", "int"]}, - {"name": "datetime_value", "type": ["null", "long"]}, + {"name": "date_value", "type": ["null", {"type": "int", "logicalType": "date"}]}, + { + "name": "datetime_value", + "type": ["null", {"type": "long", "logicalType": "timestamp-millis"}], + }, {"name": "float_value", "type": ["null", "double"]}, ], } @@ -54,8 +63,8 @@ def test_avro_reader( 
): """Reading Avro files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (2, 4): - pytest.skip("Avro files are supported on Spark 3.2+ only") + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files df = file_df_dataframe @@ -91,8 +100,8 @@ def test_avro_writer( ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) - if spark_version < (2, 4): - pytest.skip("Avro files are supported on Spark 3.2+ only") + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path df = file_df_dataframe @@ -116,3 +125,114 @@ def test_avro_writer( assert read_df.count() assert read_df.schema == df.schema assert_equal_df(read_df, df, order_by="id") + + +@pytest.mark.parametrize("column_type", [str, col]) +def test_avro_serialize_and_parse_column( + spark, + file_df_dataframe, + avro_schema, + column_type, +): + from pyspark.sql.functions import struct + from pyspark.sql.types import BinaryType + + spark_version = get_spark_version(spark) + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") + + if spark_version.major < 3: + msg = ( + f"`Avro.parse_column` or `Avro.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=msg) + else: + context_manager = contextlib.nullcontext() + + df = file_df_dataframe + avro = Avro(schema_dict=avro_schema) + + combined_df = df.withColumn("combined", struct([col(c) for c in df.columns])) + + with context_manager: + serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) + assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) + parsed_df = 
serialized_df.select(avro.parse_column(column_type("combined"))) + assert combined_df.select("combined").collect() == parsed_df.collect() + + +@pytest.mark.parametrize("column_type", [str, col]) +def test_avro_serialize_and_parse_no_schema( + spark, + file_df_dataframe, + column_type, +): + from pyspark.sql.functions import struct + from pyspark.sql.types import BinaryType + + spark_version = get_spark_version(spark) + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") + + if spark_version.major < 3: + msg = ( + f"`Avro.parse_column` or `Avro.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=msg) + else: + context_manager = contextlib.nullcontext() + + df = file_df_dataframe + avro = Avro() + + with context_manager: + combined_df = df.withColumn("combined", struct([col(c) for c in df.columns])) + serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) + assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) + + with pytest.raises( + ValueError, + match="Avro.parse_column can be used only with defined `schema_dict` or `schema_url`", + ): + serialized_df.select(avro.parse_column(column_type("combined"))) + + +@pytest.mark.parametrize("column_type", [str, col]) +@responses.activate +def test_avro_serialize_and_parse_with_schema_url( + spark, + file_df_dataframe, + column_type, + avro_schema, +): + from pyspark.sql.functions import struct + from pyspark.sql.types import BinaryType + + spark_version = get_spark_version(spark) + if spark_version < Version("2.4"): + pytest.skip("Avro files are supported on Spark 2.4+ only") + + if spark_version.major < 3: + msg = ( + f"`Avro.parse_column` or `Avro.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=msg) + else: + context_manager = contextlib.nullcontext() + 
+ # mocking the request to return a JSON schema + schema_url = "http://example.com/avro_schema" + responses.add(responses.GET, schema_url, json=avro_schema, status=200) + + df = file_df_dataframe + avro = Avro(schema_url=schema_url) + + combined_df = df.withColumn("combined", struct([col(c) for c in df.columns])) + with context_manager: + serialized_df = combined_df.select(avro.serialize_column(column_type("combined"))) + assert isinstance(serialized_df.schema["combined"].dataType, BinaryType) + parsed_df = serialized_df.select(avro.parse_column(column_type("combined"))) + assert combined_df.select("combined").collect() == parsed_df.collect() diff --git a/tests/tests_integration/test_file_format_integration/test_csv_integration.py b/tests/tests_integration/test_file_format_integration/test_csv_integration.py index 5dbfd20e1..bae6f1d4c 100644 --- a/tests/tests_integration/test_file_format_integration/test_csv_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_csv_integration.py @@ -4,21 +4,27 @@ Do not test all the possible options and combinations, we are not testing Spark here. 
""" +import contextlib +import re + import pytest from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import CSV try: - from pyspark.sql.functions import col + from pyspark.sql import Row + from pyspark.sql.functions import col, struct + from pyspark.sql.types import IntegerType, StringType, StructField, StructType from tests.util.assert_df import assert_equal_df from tests.util.spark_df import reset_column_names except ImportError: pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.csv] def test_csv_reader_with_infer_schema( @@ -46,7 +52,7 @@ def test_csv_reader_with_infer_schema( if spark_version.major < 3: # Spark 2 infers "date_value" as timestamp instead of date expected_df = df.withColumn("date_value", col("date_value").cast("timestamp")) - elif spark_version < (3, 3): + elif spark_version < Version("3.3"): # Spark 3.2 cannot infer "date_value", and return it as string expected_df = df.withColumn("date_value", col("date_value").cast("string")) @@ -132,3 +138,117 @@ def test_csv_writer_with_options( assert read_df.count() assert read_df.schema == df.schema assert_equal_df(read_df, df, order_by="id") + + +@pytest.mark.parametrize( + "csv_string, schema, options, expected", + [ + ( + "1,Anne", + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ",", "header": False}, + Row(id=1, name="Anne"), + ), + ( + "1;Anne", + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ";", "header": False}, + Row(id=1, name="Anne"), + ), + ( + '"1","Anne"', + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ",", "quote": 
'"', "header": False}, + Row(id=1, name="Anne"), + ), + ], + ids=["comma-delimited", "semicolon-delimited", "quoted-comma-delimited"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_csv_parse_column(spark, csv_string, schema, options, expected, column_type): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + msg = ( + f"`CSV.parse_column` or `CSV.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=re.escape(msg)) + else: + context_manager = contextlib.nullcontext() + + csv_handler = CSV(**options) + df = spark.createDataFrame([(csv_string,)], ["csv_string"]) + + with context_manager: + parsed_df = df.select(csv_handler.parse_column(column_type("csv_string"), schema)) + assert parsed_df.columns == ["csv_string"] + assert parsed_df.first()["csv_string"] == expected + + +@pytest.mark.parametrize( + "data, schema, options, expected_csv", + [ + ( + Row(id=1, name="Alice"), + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ","}, + "1,Alice", + ), + ( + Row(id=1, name="Alice"), + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + {"delimiter": ";"}, + "1;Alice", + ), + ], + ids=["comma-delimited", "semicolon-delimited"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_csv_serialize_column(spark, data, schema, options, expected_csv, column_type): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + msg = ( + f"`CSV.parse_column` or `CSV.serialize_column` are available " + f"only since Spark 3.x, but got {spark_version}" + ) + context_manager = pytest.raises(ValueError, match=re.escape(msg)) + else: + context_manager = contextlib.nullcontext() + + csv_handler = CSV(**options) + df = spark.createDataFrame([data], schema) + df = df.withColumn("csv_column", struct("id", "name")) + + with context_manager: + serialized_df = 
df.select(csv_handler.serialize_column(column_type("csv_column"))) + assert serialized_df.columns == ["csv_column"] + assert serialized_df.first()["csv_column"] == expected_csv + + +@pytest.mark.parametrize( + "options", + [ + ({"header": True}), + ({"compression": "gzip"}), + ({"inferSchema": True}), + ], + ids=["with-header", "with-compression", "with-inferSchema"], +) +def test_csv_serialize_column_unsupported_options_warning(spark, options): + spark_version = get_spark_version(spark) + if spark_version.major < 3: + pytest.skip("CSV.serialize_column in supported on Spark 3.x only") + + schema = StructType([StructField("id", IntegerType()), StructField("name", StringType())]) + df = spark.createDataFrame([Row(id=1, name="Alice")], schema) + df = df.withColumn("csv_column", struct("id", "name")) + + csv_handler = CSV(**options) + msg = ( + f"Option `{list(options.keys())[0]}` is set but not supported in `CSV.parse_column` or `CSV.serialize_column`." + ) + + with pytest.warns(UserWarning) as record: + df.select(csv_handler.serialize_column("csv_column")).collect() + assert record + assert msg in str(record[0].message) diff --git a/tests/tests_integration/test_file_format_integration/test_excel_integration.py b/tests/tests_integration/test_file_format_integration/test_excel_integration.py index 9228abd3d..890d9a9b6 100644 --- a/tests/tests_integration/test_file_format_integration/test_excel_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_excel_integration.py @@ -7,6 +7,7 @@ import pytest from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import Excel @@ -18,7 +19,7 @@ except ImportError: pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, 
pytest.mark.connection, pytest.mark.excel] @pytest.mark.parametrize("format", ["xlsx", "xls"]) @@ -30,7 +31,7 @@ def test_excel_reader_with_infer_schema( ): """Reading CSV files with inferSchema=True working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -79,7 +80,7 @@ def test_excel_reader_with_options( ): """Reading Excel files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -115,7 +116,7 @@ def test_excel_writer( ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) - if spark_version < (3, 2): + if spark_version < Version("3.2"): pytest.skip("Excel files are supported on Spark 3.2+ only") file_df_connection, source_path = local_fs_file_df_connection_with_path diff --git a/tests/tests_integration/test_file_format_integration/test_json_integration.py b/tests/tests_integration/test_file_format_integration/test_json_integration.py index f1fbd1380..dcdbbc039 100644 --- a/tests/tests_integration/test_file_format_integration/test_json_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_json_integration.py @@ -6,15 +6,31 @@ import pytest +from onetl._util.spark import get_spark_version +from onetl._util.version import Version from onetl.file import FileDFReader, FileDFWriter from onetl.file.format import JSON +try: + from pyspark.sql import Row + from pyspark.sql.functions import col + from pyspark.sql.types import ( + ArrayType, + IntegerType, + MapType, + StringType, + StructField, + StructType, + ) 
+except ImportError: + pytest.skip("Missing pyspark", allow_module_level=True) + try: from tests.util.assert_df import assert_equal_df except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.json] @pytest.mark.parametrize( @@ -62,3 +78,69 @@ def test_json_writer_is_not_supported( format=JSON(), target_path=json_root, ) + + +@pytest.mark.parametrize( + "json_string, schema, expected", + [ + ( + '{"id": 1, "name": "Alice"}', + StructType([StructField("id", IntegerType()), StructField("name", StringType())]), + Row(id=1, name="Alice"), + ), + ( + '[{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]', + ArrayType(StructType([StructField("id", IntegerType()), StructField("name", StringType())])), + [Row(id=1, name="Alice"), Row(id=2, name="Bob")], + ), + ( + '{"key1": "value1", "key2": "value2"}', + MapType(StringType(), StringType()), + {"key1": "value1", "key2": "value2"}, + ), + ], + ids=["struct", "map", "array"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_json_parse_column(spark, json_string, schema, expected, column_type): + spark_version = get_spark_version(spark) + if spark_version < Version("2.4") and isinstance(schema, MapType): + pytest.skip("JSON.parse_column accepts MapType only in Spark 2.4+") + + json = JSON() + df = spark.createDataFrame([(json_string,)], ["json_column"]) + parsed_df = df.select(json.parse_column(column_type("json_column"), schema)) + assert parsed_df.columns == ["json_column"] + assert parsed_df.select("json_column").first()["json_column"] == expected + + +@pytest.mark.parametrize( + "data, schema, expected_json", + [ + ( + {"id": 1, "name": "Alice"}, + StructType([StructField("id", IntegerType(), True), StructField("name", StringType(), True)]), + '{"id":1,"name":"Alice"}', + ), + ( + [{"id": 1, 
"name": "Alice"}, {"id": 2, "name": "Bob"}], + ArrayType(StructType([StructField("id", IntegerType(), True), StructField("name", StringType(), True)])), + '[{"id":1,"name":"Alice"},{"id":2,"name":"Bob"}]', + ), + ( + {"key1": "value1", "key2": "value2"}, + MapType(StringType(), StringType()), + '{"key1":"value1","key2":"value2"}', + ), + ], + ids=["struct", "array", "map"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_json_serialize_column(spark, data, schema, expected_json, column_type): + json = JSON() + df_schema = StructType([StructField("json_string", schema)]) + df = spark.createDataFrame([(data,)], df_schema) + serialized_df = df.select(json.serialize_column(column_type("json_string"))) + actual_json = serialized_df.select("json_string").first()["json_string"] + assert actual_json == expected_json + assert serialized_df.columns == ["json_string"] diff --git a/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py b/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py index f4678e17d..a246843e4 100644 --- a/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_jsonline_integration.py @@ -14,7 +14,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.json] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_orc_integration.py b/tests/tests_integration/test_file_format_integration/test_orc_integration.py index a848f0f25..40902cef9 100644 --- a/tests/tests_integration/test_file_format_integration/test_orc_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_orc_integration.py @@ -14,7 +14,7 @@ except 
ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.orc] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_parquet_integration.py b/tests/tests_integration/test_file_format_integration/test_parquet_integration.py index 41d492c43..ea5844eb1 100644 --- a/tests/tests_integration/test_file_format_integration/test_parquet_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_parquet_integration.py @@ -14,7 +14,7 @@ except ImportError: pytest.skip("Missing pandas", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.parquet] @pytest.mark.parametrize( diff --git a/tests/tests_integration/test_file_format_integration/test_xml_integration.py b/tests/tests_integration/test_file_format_integration/test_xml_integration.py index 2be9d33a4..5ebaaf1ab 100644 --- a/tests/tests_integration/test_file_format_integration/test_xml_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_xml_integration.py @@ -4,6 +4,8 @@ Do not test all the possible options and combinations, we are not testing Spark here. 
""" +import datetime + import pytest from onetl._util.spark import get_spark_version @@ -11,11 +13,14 @@ from onetl.file.format import XML try: + from pyspark.sql import Row + from pyspark.sql.functions import col + from tests.util.assert_df import assert_equal_df except ImportError: - pytest.skip("Missing pandas", allow_module_level=True) + pytest.skip("Missing pandas or pyspark", allow_module_level=True) -pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection] +pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.xml] @pytest.fixture() @@ -43,7 +48,7 @@ def test_xml_reader( ): """Reading XML files working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -70,7 +75,7 @@ def test_xml_reader_with_infer_schema( ): """Reading XML files with inferSchema=True working as expected on any Spark, Python and Java versions""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") file_df_connection, source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -108,7 +113,7 @@ def test_xml_writer( ): """Written files can be read by Spark""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") file_df_connection, source_path = local_fs_file_df_connection_with_path @@ -150,7 +155,7 @@ def test_xml_reader_with_attributes( ): """Reading XML files with attributes works as expected""" spark_version = get_spark_version(spark) - if spark_version < (3, 0): + if spark_version.major < 3: pytest.skip("XML files are supported on Spark 3.x only") local_fs, 
source_path, _ = local_fs_file_df_connection_with_path_and_files @@ -166,3 +171,45 @@ def test_xml_reader_with_attributes( assert read_df.count() assert read_df.schema == expected_xml_attributes_df.schema assert_equal_df(read_df, expected_xml_attributes_df, order_by="id") + + +@pytest.mark.parametrize( + "xml_input, expected_row", + [ + ( + """ + 1 + Alice + 123 + 2021-01-01 + 2021-01-01T07:01:01Z + 1.23 + """, + Row( + xml_string=Row( + id=1, + str_value="Alice", + int_value=123, + date_value=datetime.date(2021, 1, 1), + datetime_value=datetime.datetime(2021, 1, 1, 7, 1, 1), + float_value=1.23, + ), + ), + ), + ], + ids=["basic-case"], +) +@pytest.mark.parametrize("column_type", [str, col]) +def test_xml_parse_column(spark, xml_input: str, expected_row: Row, column_type, file_df_schema): + from onetl.file.format import XML + + spark_version = get_spark_version(spark) + if spark_version.major < 3: + pytest.skip("XML files are supported on Spark 3.x only") + + xml = XML(row_tag="item") + df = spark.createDataFrame([(xml_input,)], ["xml_string"]) + parsed_df = df.select(xml.parse_column(column_type("xml_string"), schema=file_df_schema)) + result_row = parsed_df.first() + + assert result_row == expected_row diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py index e38de7413..72314b5b3 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_clickhouse_reader_integration.py @@ -155,6 +155,7 @@ def test_clickhouse_reader_snapshot_with_columns(spark, processing, load_table_d assert count_df.collect()[0][0] == table_df.count() +@pytest.mark.xfail(reason="Clickhouse <24 deduplicated column names, but 24+ does not") def 
test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, prepare_schema_table): clickhouse = Clickhouse( host=processing.host, @@ -180,9 +181,9 @@ def test_clickhouse_reader_snapshot_with_columns_duplicated(spark, processing, p ], ) - # Clickhouse can detect that column is already a part of * and does not produce duplicates - df2 = reader2.run() - assert df1.columns == df2.columns + with pytest.raises(Exception, match="The column `id_int` already exists"): + df2 = reader2.run() + assert df1.columns == df2.columns def test_clickhouse_reader_snapshot_with_columns_mixed_naming(spark, processing, get_schema_table): diff --git a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py index 86f2b7e98..9c6e3db57 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_reader_integration/test_kafka_reader_integration.py @@ -163,7 +163,7 @@ def test_kafka_reader_topic_does_not_exist(spark, processing): @pytest.mark.parametrize("group_id_option", ["group.id", "groupIdPrefix"]) def test_kafka_reader_with_group_id(group_id_option, spark, processing, kafka_dataframe_schema, kafka_topic): if get_spark_version(spark).major < 3: - pytest.skip("Spark 3.x or later is required to pas group.id") + pytest.skip("Spark 3.x or later is required to pass group.id") first_span = processing.create_pandas_df(min_id=0, max_id=100) processing.insert_pandas_df_into_topic(first_span, kafka_topic) diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 73d8abfb2..78656d834 100644 --- 
a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -46,6 +46,21 @@ def test_clickhouse_connection_check_fail(spark): clickhouse.check() +def test_clickhouse_connection_check_extra_is_handled_by_driver(spark, processing): + clickhouse = Clickhouse( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"socket_timeout": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + clickhouse.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix): clickhouse = Clickhouse( @@ -227,9 +242,7 @@ def table_finalizer(): updated_df = pandas.concat([updated_rows, unchanged_rows]) processing.assert_equal_df(df=df, other_frame=updated_df, order_by="id_int") - # not supported by Clickhouse - with pytest.raises(Exception): - clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") + clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") clickhouse.execute(f"ALTER TABLE {temp_table} DELETE WHERE id_int < 70{suffix}") df = clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}") @@ -252,7 +265,7 @@ def table_finalizer(): assert not clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}").count() -@pytest.mark.xfail(reason="Clickhouse 20.7 doesn't support functions") +@pytest.mark.xfail(reason="CREATE FUNCTION is not supported in Clickhouse < 21.20") @pytest.mark.parametrize("suffix", ["", ";"]) def test_clickhouse_connection_execute_function( request, diff --git a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py index 4514594c5..6775244ee 100644 --- 
a/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_greenplum_integration.py @@ -48,6 +48,21 @@ def test_greenplum_connection_check_fail(spark): greenplum.check() +def test_greenplum_connection_check_extra_is_handled_by_driver(spark, processing): + greenplum = Greenplum( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={**processing.extra, "connectTimeout": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + greenplum.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_greenplum_connection_fetch(spark, processing, load_table_data, suffix): greenplum = Greenplum( @@ -62,7 +77,7 @@ def test_greenplum_connection_fetch(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = greenplum.fetch(f"SELECT * FROM {table}{suffix}", Greenplum.JDBCOptions(fetchsize=2)) + df = greenplum.fetch(f"SELECT * FROM {table}{suffix}", Greenplum.FetchOptions(fetchsize=2)) table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -93,7 +108,10 @@ def test_greenplum_connection_ddl(spark, processing, get_schema_table, suffix): table_name, schema, table = get_schema_table fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} - assert not greenplum.execute(f"SET search_path TO {schema}, public{suffix}", Greenplum.JDBCOptions(queryTimeout=1)) + assert not greenplum.execute( + f"SET search_path TO {schema}, public{suffix}", + Greenplum.ExecuteOptions(queryTimeout=1), + ) assert not greenplum.execute(processing.create_schema_ddl(schema) + suffix) assert not greenplum.execute(processing.create_table_ddl(table, fields, schema) + suffix) diff --git 
a/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py index 9a875671a..4fad8a754 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mssql_integration.py @@ -55,6 +55,21 @@ def test_mssql_connection_check_fail(spark): mssql.check() +def test_mssql_connection_check_extra_is_handled_by_driver(spark, processing): + mssql = MSSQL( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"trustServerCertificate": "false"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + mssql.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_mssql_connection_sql(spark, processing, load_table_data, suffix): mssql = MSSQL( diff --git a/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py index 72a6b3b8f..8a4840329 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mysql_integration.py @@ -46,6 +46,21 @@ def test_mysql_connection_check_fail(spark): mysql.check() +def test_mysql_connection_check_extra_is_handled_by_driver(spark, processing): + mysql = MySQL( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + extra={"tcpKeepAlive": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + mysql.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_mysql_connection_sql(spark, processing, load_table_data, suffix): mysql = MySQL( diff --git 
a/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py b/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py index 6bd96b259..485ca1911 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_oracle_integration.py @@ -55,6 +55,22 @@ def test_oracle_connection_check_fail(spark): oracle.check() +def test_oracle_connection_check_extra_is_handled_by_driver(spark, processing): + oracle = Oracle( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + spark=spark, + sid=processing.sid, + service_name=processing.service_name, + extra={"defaultRowPrefetch": "wrong_type"}, + ) + + with pytest.raises(RuntimeError, match="Connection is unavailable"): + oracle.check() + + @pytest.mark.parametrize("suffix", ["", ";"]) def test_oracle_connection_sql(spark, processing, load_table_data, suffix): oracle = Oracle( diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index 1fb74095f..b72f8ac1b 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -91,7 +91,7 @@ def test_postgres_connection_fetch(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = postgres.fetch(f"SELECT * FROM {table}{suffix}", Postgres.JDBCOptions(fetchsize=2)) + df = postgres.fetch(f"SELECT * FROM {table}{suffix}", Postgres.FetchOptions(fetchsize=2)) table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -122,7 +122,7 @@ def test_postgres_connection_ddl(spark, processing, get_schema_table, suffix): table_name, schema, table = get_schema_table fields 
= {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} - assert not postgres.execute(f"SET search_path TO {schema}, public{suffix}", Postgres.JDBCOptions(queryTimeout=1)) + assert not postgres.execute(f"SET search_path TO {schema}, public{suffix}", Postgres.ExecuteOptions(queryTimeout=1)) assert not postgres.execute(processing.create_schema_ddl(schema) + suffix) assert not postgres.execute(processing.create_table_ddl(table, fields, schema) + suffix) @@ -213,7 +213,7 @@ def table_finalizer(): SELECT * FROM {table_name} WHERE id_int >= 50 RETURNING id_int{suffix} - """, + """, ) df = postgres.fetch(f"SELECT * FROM {temp_table}{suffix}") @@ -302,7 +302,7 @@ def test_postgres_connection_execute_procedure( AS $$ SELECT COUNT(*) FROM {table}; $${suffix} - """, + """, ) def proc_finalizer(): @@ -358,7 +358,7 @@ def proc_finalizer(): AS $$ SELECT COUNT(*) FROM {table}; $${suffix} - """, + """, ) with pytest.raises(Exception): @@ -412,7 +412,7 @@ def test_postgres_connection_execute_procedure_arguments( SELECT COUNT(*) FROM {table} WHERE id_int = idd; $${suffix} - """, + """, ) def proc_finalizer(): @@ -471,7 +471,7 @@ def test_postgres_connection_execute_procedure_inout( WHERE id_int < idd; END $${suffix} - """, + """, ) def proc_finalizer(): @@ -518,7 +518,7 @@ def test_postgres_connection_execute_procedure_ddl( AS $$ CREATE TABLE {table} (iid INT, text VARCHAR(400)); $${suffix} - """, + """, ) def proc_finalizer(): @@ -565,7 +565,7 @@ def table_finalizer(): AS $$ INSERT INTO {table} VALUES(idd, text); $${suffix} - """, + """, ) def proc_finalizer(): @@ -605,7 +605,7 @@ def test_postgres_connection_execute_function( RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -667,7 +667,7 @@ def function_finalizer(): RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) # replace @@ -681,7 +681,7 @@ def function_finalizer(): RETURN 100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) # missing 
@@ -706,7 +706,7 @@ def function_finalizer(): RETURN 100 END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) @@ -746,7 +746,7 @@ def test_postgres_connection_execute_function_arguments( RETURN i*100; END $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -825,7 +825,7 @@ def test_postgres_connection_execute_function_table( FROM {table} WHERE id_int < i; $$ LANGUAGE SQL{suffix} - """, + """, ) def function_finalizer(): @@ -873,7 +873,7 @@ def test_postgres_connection_execute_function_ddl( RETURN 1; END; $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -941,7 +941,7 @@ def table_finalizer(): RETURN idd; END; $$ LANGUAGE PLPGSQL{suffix} - """, + """, ) def function_finalizer(): @@ -961,3 +961,77 @@ def function_finalizer(): df = postgres.sql(f"SELECT {func}(2, 'cde') AS result") result_df = pandas.DataFrame([[2]], columns=["result"]) processing.assert_equal_df(df=df, other_frame=result_df) + + +@pytest.mark.parametrize( + "options_class, options_kwargs, expected_warning", + [ + (Postgres.ReadOptions, {"fetchsize": 5000, "sessionInitStatement": "SET timezone TO 'UTC'"}, UserWarning), + (Postgres.SQLOptions, {"fetchsize": 5000, "sessionInitStatement": "SET timezone TO 'UTC'"}, None), + ], +) +def test_postgres_connection_sql_options( + options_class, + options_kwargs, + expected_warning, + spark, + processing, + load_table_data, +): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + table = load_table_data.full_name + options = options_class(**options_kwargs) + + if expected_warning: + with pytest.warns( + expected_warning, + match="Using `ReadOptions` for `sql` method is deprecated, use `SQLOptions` instead.", + ): + df = postgres.sql(f"SELECT * FROM {table}", options=options) + else: + df = postgres.sql(f"SELECT * FROM {table}", options=options) + + table_df = processing.get_expected_dataframe( + 
schema=load_table_data.schema, + table=load_table_data.table, + order_by="id_int", + ) + + processing.assert_equal_df(df=df, other_frame=table_df) + + +def test_postgres_fetch_with_legacy_jdbc_options(spark, processing): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + options = Postgres.JDBCOptions(fetchsize=10) + + df = postgres.fetch("SELECT CURRENT_TIMESTAMP;", options=options) + assert df is not None + + +def test_postgres_execute_with_legacy_jdbc_options(spark, processing): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + options = Postgres.JDBCOptions(query_timeout=30) + postgres.execute("DROP TABLE IF EXISTS temp_table;", options=options) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py index 77ec071b3..0e4fb46ef 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_clickhouse.py @@ -212,7 +212,7 @@ def test_clickhouse_strategy_incremental_nothing_to_read(spark, processing, prep [ ("float_value", ValueError, "Expression 'float_value' returned values"), ("text_string", RuntimeError, "Cannot detect HWM type for"), - ("unknown_column", Exception, "Missing columns"), + ("unknown_column", Exception, "(Missing columns|Unknown expression).*"), ], ) def test_clickhouse_strategy_incremental_wrong_hwm( @@ -302,12 +302,37 @@ def test_clickhouse_strategy_incremental_explicit_hwm_type( 
ColumnDateHWM, lambda x: x.isoformat(), ), + pytest.param( + "hwm_date", + "CAST(text_string AS Date32)", + ColumnDateHWM, + lambda x: x.isoformat(), + marks=pytest.mark.xfail(reason="Date32 type was added in ClickHouse 21.9"), + ), ( "hwm_datetime", "CAST(text_string AS DateTime)", ColumnDateTimeHWM, lambda x: x.isoformat(), ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64)", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64(3))", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), + ( + "hwm_datetime", + "CAST(text_string AS DateTime64(6))", + ColumnDateTimeHWM, + lambda x: x.isoformat(), + ), ], ) def test_clickhouse_strategy_incremental_with_hwm_expr( diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py index 97cb42dc8..3c2ef1605 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Avro +pytestmark = [pytest.mark.avro] + @pytest.mark.parametrize( "spark_version", @@ -33,6 +35,8 @@ def test_avro_get_packages_scala_version_not_supported(): ("2.4.0", "2.12", "org.apache.spark:spark-avro_2.12:2.4.0"), ("3.5.0", "2.12", "org.apache.spark:spark-avro_2.12:3.5.0"), ("3.5.0", "2.13", "org.apache.spark:spark-avro_2.13:3.5.0"), + # Scala version contain three digits when only two needed + ("3.5.0", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.0"), ], ) def test_avro_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py b/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py index 125920f18..34b366c71 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_csv_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import CSV +pytestmark = 
[pytest.mark.csv] + def test_csv_options_default(): csv = CSV() diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py index 9424e8b1c..95dae3da1 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Excel +pytestmark = [pytest.mark.excel] + @pytest.mark.parametrize( "spark_version", @@ -42,6 +44,8 @@ def test_excel_get_packages_package_version_not_supported(): # Override package version ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), ("3.5.0", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.0_0.18.0"]), + # Scala version contain three digits when only two needed + ("3.5.0", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]), ], ) def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): diff --git a/tests/tests_unit/test_file/test_format_unit/test_json_unit.py b/tests/tests_unit/test_file/test_format_unit/test_json_unit.py index 1b3b4527a..702063ea8 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_json_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_json_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import JSON +pytestmark = [pytest.mark.json] + def test_json_options_default(): json = JSON() diff --git a/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py b/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py index 9b2e361cd..65199adbb 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_jsonline_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import JSONLine +pytestmark = [pytest.mark.json] + def test_jsonline_options_default(): jsonline = JSONLine() diff --git a/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py 
b/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py index ea3d5a2a8..60910f045 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_orc_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import ORC +pytestmark = [pytest.mark.orc] + @pytest.mark.parametrize( "known_option", diff --git a/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py b/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py index ef83e07b0..e332f50d9 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_parquet_unit.py @@ -4,6 +4,8 @@ from onetl.file.format import Parquet +pytestmark = [pytest.mark.parquet] + @pytest.mark.parametrize( "known_option", diff --git a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py index 17d9dac2a..b7ee748da 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_xml_unit.py @@ -4,22 +4,26 @@ from onetl.file.format import XML +pytestmark = [pytest.mark.xml] + @pytest.mark.parametrize( "spark_version, scala_version, package_version, expected_packages", [ - ("3.2.4", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.2.4", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.4.1", "2.12", "0.18.0", ["com.databricks:spark-xml_2.12:0.18.0"]), - ("3.0.0", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), - ("3.0.0", "2.12", "0.17.0", ["com.databricks:spark-xml_2.12:0.17.0"]), - ("3.1.2", None, None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.0.0", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), + ("3.0.0", "2.12", "0.18.0", ["com.databricks:spark-xml_2.12:0.18.0"]), + ("3.1.2", None, None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.1.2", "2.12", "0.16.0", ["com.databricks:spark-xml_2.12:0.16.0"]), - 
("3.2.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.2.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.18.0"]), ("3.2.0", "2.12", "0.15.0", ["com.databricks:spark-xml_2.12:0.15.0"]), - ("3.2.4", "2.13", None, ["com.databricks:spark-xml_2.13:0.17.0"]), + ("3.2.4", "2.13", None, ["com.databricks:spark-xml_2.13:0.18.0"]), ("3.4.1", "2.13", "0.18.0", ["com.databricks:spark-xml_2.13:0.18.0"]), + ("3.4.1", "3.0", "0.18.0", ["com.databricks:spark-xml_3.0:0.18.0"]), ("3.3.0", None, "0.16.0", ["com.databricks:spark-xml_2.12:0.16.0"]), - ("3.3.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.17.0"]), + ("3.3.0", "2.12", None, ["com.databricks:spark-xml_2.12:0.18.0"]), + ("3.2.4", "2.12.1", "0.15.0", ["com.databricks:spark-xml_2.12:0.15.0"]), ], ) def test_xml_get_packages(spark_version, scala_version, package_version, expected_packages): @@ -51,11 +55,11 @@ def test_xml_get_packages_restriction_for_spark_2x(spark_version, scala_version, "spark_version, scala_version, package_version", [ ("3.2.4", "2.11", None), - ("3.4.1", "2.14", None), + ("3.4.1", "2.10", None), ], ) def test_xml_get_packages_scala_version_error(spark_version, scala_version, package_version): - with pytest.raises(ValueError, match=r"Scala version must be 2.12 or 2.13, got \d+\.\d+"): + with pytest.raises(ValueError, match=f"Scala version must be at least 2.12, got {scala_version}"): XML.get_packages( spark_version=spark_version, scala_version=scala_version, @@ -120,10 +124,3 @@ def test_xml_options_unknown(caplog): xml = XML(row_tag="item", unknownOption="abc") assert xml.unknownOption == "abc" assert "Options ['unknownOption'] are not known by XML, are you sure they are valid?" 
in caplog.text - - -@pytest.mark.local_fs -def test_xml_missing_package(spark_no_packages): - msg = "Cannot import Java class 'com.databricks.spark.xml.XmlReader'" - with pytest.raises(ValueError, match=msg): - XML(row_tag="item").check_if_supported(spark_no_packages) diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 42b5582ae..4ce55a572 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -1,5 +1,3 @@ -import re - import pytest from onetl.connection import Clickhouse @@ -8,21 +6,88 @@ def test_clickhouse_driver(): - assert Clickhouse.DRIVER == "ru.yandex.clickhouse.ClickHouseDriver" + assert Clickhouse.DRIVER == "com.clickhouse.jdbc.ClickHouseDriver" def test_clickhouse_package(): - warning_msg = re.escape("will be removed in 1.0.0, use `Clickhouse.get_packages()` instead") - with pytest.warns(UserWarning, match=warning_msg): - assert Clickhouse.package == "ru.yandex.clickhouse:clickhouse-jdbc:0.3.2" + expected_packages = ( + "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5," + "org.apache.httpcomponents.client5:httpclient5:5.3.1" + ) + assert Clickhouse.package == expected_packages + + +@pytest.mark.parametrize( + "package_version, apache_http_client_version, expected_packages", + [ + ( + None, + None, + [ + "com.clickhouse:clickhouse-jdbc:0.6.0-patch5", + "com.clickhouse:clickhouse-http-client:0.6.0-patch5", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ( + "0.6.0-patch3", + "5.3.1", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0-patch3", + "com.clickhouse:clickhouse-http-client:0.6.0-patch3", + "org.apache.httpcomponents.client5:httpclient5:5.3.1", + ], + ), + ( + "0.4.0", + "4.5.14", + ["com.clickhouse:clickhouse-jdbc:0.4.0", "com.clickhouse:clickhouse-http-client:0.4.0"], + ), # No HTTP client 
should be included + ( + "0.5.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.5.0", + "com.clickhouse:clickhouse-http-client:0.5.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), + ( + "0.6.0", + "4.5.14", + [ + "com.clickhouse:clickhouse-jdbc:0.6.0", + "com.clickhouse:clickhouse-http-client:0.6.0", + "org.apache.httpcomponents.client5:httpclient5:4.5.14", + ], + ), + ], +) +def test_clickhouse_get_packages(package_version, apache_http_client_version, expected_packages): + assert ( + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) + == expected_packages + ) -def test_clickhouse_get_packages(): - assert Clickhouse.get_packages() == ["ru.yandex.clickhouse:clickhouse-jdbc:0.3.2"] +@pytest.mark.parametrize( + "package_version, apache_http_client_version", + [ + ("0.7", "5.3.1"), + ("1", "5.4.0"), + ("a.b.c", "5.3.1"), + ], +) +def test_clickhouse_get_packages_invalid_version(package_version, apache_http_client_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + Clickhouse.get_packages(package_version=package_version, apache_http_client_version=apache_http_client_version) def test_clickhouse_missing_package(spark_no_packages): - msg = "Cannot import Java class 'ru.yandex.clickhouse.ClickHouseDriver'" + msg = "Cannot import Java class 'com.clickhouse.jdbc.ClickHouseDriver'" with pytest.raises(ValueError, match=msg): Clickhouse( host="some_host", @@ -56,6 +121,12 @@ def test_clickhouse(spark_mock): assert conn.database == "database" assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123/database", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not 
in repr(conn) @@ -79,6 +150,12 @@ def test_clickhouse_with_port(spark_mock): assert conn.database == "database" assert conn.jdbc_url == "jdbc:clickhouse://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:5000/database", + } def test_clickhouse_without_database(spark_mock): @@ -92,6 +169,12 @@ def test_clickhouse_without_database(spark_mock): assert not conn.database assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123", + } def test_clickhouse_with_extra(spark_mock): @@ -100,11 +183,18 @@ def test_clickhouse_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"socket_timeout": "120000", "query": "SELECT%201%3B"}, + extra={"socket_timeout": 120000, "custom_http_params": "key1=value1,key2=value2"}, spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:clickhouse://some_host:8123/database?query=SELECT%201%3B&socket_timeout=120000" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.clickhouse.jdbc.ClickHouseDriver", + "url": "jdbc:clickhouse://some_host:8123/database", + "socket_timeout": 120000, + "custom_http_params": "key1=value1,key2=value2", + } def test_clickhouse_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py index 8e51d0a89..a280d53c7 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_db_options_unit.py @@ -3,7 +3,16 @@ import pytest -from onetl.connection import Greenplum, Hive, Postgres +from onetl.connection import ( + MSSQL, + Clickhouse, + Greenplum, + Hive, + MySQL, 
+ Oracle, + Postgres, + Teradata, +) pytestmark = [pytest.mark.postgres] @@ -13,10 +22,31 @@ [ Postgres.ReadOptions, Postgres.WriteOptions, - Postgres.JDBCOptions, + Postgres.FetchOptions, + Postgres.ExecuteOptions, Postgres.Options, Greenplum.ReadOptions, Greenplum.WriteOptions, + Clickhouse.ReadOptions, + Clickhouse.WriteOptions, + Clickhouse.FetchOptions, + Clickhouse.ExecuteOptions, + MSSQL.ReadOptions, + MSSQL.WriteOptions, + MSSQL.FetchOptions, + MSSQL.ExecuteOptions, + MySQL.ReadOptions, + MySQL.WriteOptions, + MySQL.FetchOptions, + MySQL.ExecuteOptions, + Teradata.ReadOptions, + Teradata.WriteOptions, + Teradata.FetchOptions, + Teradata.ExecuteOptions, + Oracle.ReadOptions, + Oracle.WriteOptions, + Oracle.FetchOptions, + Oracle.ExecuteOptions, ], ) @pytest.mark.parametrize( @@ -38,11 +68,21 @@ def test_db_options_connection_parameters_cannot_be_passed(options_class, arg, v [ (Hive.WriteOptions, "HiveWriteOptions", {"if_exists": "replace_overlapping_partitions"}), (Hive.Options, "HiveLegacyOptions", {"if_exists": "replace_overlapping_partitions"}), - (Postgres.ReadOptions, "JDBCReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), - (Postgres.WriteOptions, "JDBCWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Postgres.ReadOptions, "PostgresReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Postgres.WriteOptions, "PostgresWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), (Postgres.Options, "JDBCLegacyOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), (Greenplum.ReadOptions, "GreenplumReadOptions", {"partitions": 10}), (Greenplum.WriteOptions, "GreenplumWriteOptions", {"if_exists": "replace_entire_table"}), + (Clickhouse.ReadOptions, "ClickhouseReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Clickhouse.WriteOptions, "ClickhouseWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (MSSQL.ReadOptions, "MSSQLReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), 
+ (MSSQL.WriteOptions, "MSSQLWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (MySQL.ReadOptions, "MySQLReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (MySQL.WriteOptions, "MySQLWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Teradata.ReadOptions, "TeradataReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Teradata.WriteOptions, "TeradataWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), + (Oracle.ReadOptions, "OracleReadOptions", {"fetchsize": 10, "keytab": "a/b/c"}), + (Oracle.WriteOptions, "OracleWriteOptions", {"if_exists": "replace_entire_table", "keytab": "a/b/c"}), ], ) def test_db_options_warn_for_unknown(options_class, options_class_name, known_options, caplog): @@ -64,18 +104,20 @@ def test_db_options_warn_for_unknown(options_class, options_class_name, known_op @pytest.mark.parametrize( "options_class,options", [ - ( - Postgres.ReadOptions, - Postgres.WriteOptions(), - ), - ( - Postgres.WriteOptions, - Postgres.ReadOptions(), - ), - ], - ids=[ - "Write options object passed to ReadOptions", - "Read options object passed to WriteOptions", + (Postgres.ReadOptions, Postgres.WriteOptions()), + (Postgres.WriteOptions, Postgres.ReadOptions()), + (Clickhouse.ReadOptions, Clickhouse.WriteOptions()), + (Clickhouse.WriteOptions, Clickhouse.ReadOptions()), + (MSSQL.ReadOptions, MSSQL.WriteOptions()), + (MSSQL.WriteOptions, MSSQL.ReadOptions()), + (MySQL.ReadOptions, MySQL.WriteOptions()), + (MySQL.WriteOptions, MySQL.ReadOptions()), + (Teradata.ReadOptions, Teradata.WriteOptions()), + (Teradata.WriteOptions, Teradata.ReadOptions()), + (Greenplum.ReadOptions, Greenplum.WriteOptions()), + (Greenplum.WriteOptions, Greenplum.ReadOptions()), + (Oracle.ReadOptions, Oracle.WriteOptions()), + (Oracle.WriteOptions, Oracle.ReadOptions()), ], ) def test_db_options_parse_mismatch_class(options_class, options): @@ -105,14 +147,36 @@ def 
test_db_options_parse_mismatch_connection_and_options_types(connection, opti @pytest.mark.parametrize( "options_class", [ + # PostgreSQL options Postgres.ReadOptions, Postgres.WriteOptions, - Postgres.JDBCOptions, + Postgres.FetchOptions, + Postgres.ExecuteOptions, + Postgres.Options, Greenplum.ReadOptions, Greenplum.WriteOptions, Hive.WriteOptions, - Postgres.Options, Hive.Options, + Clickhouse.ReadOptions, + Clickhouse.WriteOptions, + Clickhouse.FetchOptions, + Clickhouse.ExecuteOptions, + MSSQL.ReadOptions, + MSSQL.WriteOptions, + MSSQL.FetchOptions, + MSSQL.ExecuteOptions, + MySQL.ReadOptions, + MySQL.WriteOptions, + MySQL.FetchOptions, + MySQL.ExecuteOptions, + Teradata.ReadOptions, + Teradata.WriteOptions, + Teradata.FetchOptions, + Teradata.ExecuteOptions, + Oracle.ReadOptions, + Oracle.WriteOptions, + Oracle.FetchOptions, + Oracle.ExecuteOptions, ], ) @pytest.mark.parametrize( diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index bda06d282..f3b996140 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -65,6 +65,8 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): # Override Scala version detected automatically ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), + # Scala version contain three digits when only two needed + ("3.2.4", "2.11.1", "io.pivotal:greenplum-spark_2.11:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): @@ -117,7 +119,15 @@ def test_greenplum(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&tcpKeepAlive=true" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + 
assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -133,7 +143,15 @@ def test_greenplum_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&tcpKeepAlive=true" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5000/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + } def test_greenplum_without_database_error(spark_mock): @@ -159,9 +177,16 @@ def test_greenplum_with_extra(spark_mock): # `server.*` and `pool.*` options are ignored while generating jdbc_url # they are used only in `read_source_as_df` and `write_df_to_target` - assert conn.jdbc_url == ( - "jdbc:postgresql://some_host:5432/database?ApplicationName=override&autosave=always&tcpKeepAlive=false" - ) + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "override", + "tcpKeepAlive": "false", + "autosave": "always", + } def test_greenplum_without_mandatory_args(spark_mock): @@ -208,7 +233,8 @@ def test_greenplum_write_options_default(): [ (Greenplum.ReadOptions, "GreenplumReadOptions"), (Greenplum.WriteOptions, "GreenplumWriteOptions"), - (Greenplum.JDBCOptions, "JDBCOptions"), + (Greenplum.FetchOptions, "GreenplumFetchOptions"), + (Greenplum.ExecuteOptions, "GreenplumExecuteOptions"), (Greenplum.Extra, "GreenplumExtra"), ], ) @@ 
-218,7 +244,8 @@ def test_greenplum_jdbc_options_populated_by_connection_class(klass, name): klass(user="me", password="abc", driver="some.Class", url="jdbc:postgres://some/db") -def test_greenplum_read_write_options_populated_by_connection_class(): +@pytest.mark.parametrize("options_class", [Greenplum.FetchOptions, Greenplum.ExecuteOptions]) +def test_greenplum_read_write_options_populated_by_connection_class(options_class): error_msg = r"Options \['dbschema', 'dbtable'\] are not allowed to use in a GreenplumReadOptions" with pytest.raises(ValueError, match=error_msg): Greenplum.ReadOptions(dbschema="myschema", dbtable="mytable") @@ -227,8 +254,8 @@ def test_greenplum_read_write_options_populated_by_connection_class(): with pytest.raises(ValueError, match=error_msg): Greenplum.WriteOptions(dbschema="myschema", dbtable="mytable") - # JDBCOptions does not have such restriction - options = Greenplum.JDBCOptions(dbschema="myschema", dbtable="mytable") + # FetchOptions & ExecuteOptions does not have such restriction + options = options_class(dbschema="myschema", dbtable="mytable") assert options.dbschema == "myschema" assert options.dbtable == "mytable" diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index 7c8ecfcca..699838889 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -3,7 +3,7 @@ import pytest from onetl._internal import to_camel -from onetl.connection import Postgres +from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, Teradata from onetl.connection.db_connection.jdbc_connection import JDBCTableExistBehavior pytestmark = [pytest.mark.postgres] @@ -44,20 +44,57 @@ def test_jdbc_options_default(): ("properties", {"abc": "cde"}), ], ) -def test_jdbc_read_write_options_populated_by_connection_class(arg, value): - error_msg = rf"Options 
\['{arg}'\] are not allowed to use in a JDBCReadOptions" - with pytest.raises(ValueError, match=error_msg): - Postgres.ReadOptions.parse({arg: value}) - - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCWriteOptions" - with pytest.raises(ValueError, match=error_msg): - Postgres.WriteOptions.parse({arg: value}) - - # JDBCOptions does not have such restriction - options = Postgres.JDBCOptions.parse({arg: value}) - assert options.dict()[arg] == value +@pytest.mark.parametrize( + "options_class, read_write_restriction", + [ + (Postgres.FetchOptions, False), + (Postgres.ExecuteOptions, False), + (Postgres.ReadOptions, True), + (Postgres.WriteOptions, True), + (Clickhouse.FetchOptions, False), + (Clickhouse.ExecuteOptions, False), + (Clickhouse.ReadOptions, True), + (Clickhouse.WriteOptions, True), + (MSSQL.FetchOptions, False), + (MSSQL.ExecuteOptions, False), + (MSSQL.ReadOptions, True), + (MSSQL.WriteOptions, True), + (MySQL.FetchOptions, False), + (MySQL.ExecuteOptions, False), + (MySQL.ReadOptions, True), + (MySQL.WriteOptions, True), + (Teradata.FetchOptions, False), + (Teradata.ExecuteOptions, False), + (Teradata.ReadOptions, True), + (Teradata.WriteOptions, True), + (Oracle.FetchOptions, False), + (Oracle.ExecuteOptions, False), + (Oracle.ReadOptions, True), + (Oracle.WriteOptions, True), + ], +) +def test_jdbc_read_write_options_populated_by_connection_class(arg, value, options_class, read_write_restriction): + if read_write_restriction: + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class.__name__}" + with pytest.raises(ValueError, match=error_msg): + options_class.parse({arg: value}) + else: + # FetchOptions & ExecuteOptions does not have such restriction + options = options_class.parse({arg: value}) + assert options.dict()[arg] == value +@pytest.mark.parametrize( + "options_class, options_class_name", + [ + (Postgres.ReadOptions, "PostgresReadOptions"), + (Clickhouse.ReadOptions, "ClickhouseReadOptions"), + 
(MSSQL.ReadOptions, "MSSQLReadOptions"), + (MySQL.ReadOptions, "MySQLReadOptions"), + (Teradata.ReadOptions, "TeradataReadOptions"), + (Oracle.ReadOptions, "OracleReadOptions"), + ], +) @pytest.mark.parametrize( "arg, value", [ @@ -72,12 +109,23 @@ def test_jdbc_read_write_options_populated_by_connection_class(arg, value): ("createTableColumnTypes", "a varchar"), ], ) -def test_jdbc_write_options_cannot_be_used_in_read_options(arg, value): - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCReadOptions" +def test_jdbc_write_options_cannot_be_used_in_read_options(arg, value, options_class, options_class_name): + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class_name}" with pytest.raises(ValueError, match=error_msg): - Postgres.ReadOptions.parse({arg: value}) + options_class.parse({arg: value}) +@pytest.mark.parametrize( + "options_class, options_class_name", + [ + (Postgres.WriteOptions, "PostgresWriteOptions"), + (Clickhouse.WriteOptions, "ClickhouseWriteOptions"), + (MSSQL.WriteOptions, "MSSQLWriteOptions"), + (MySQL.WriteOptions, "MySQLWriteOptions"), + (Teradata.WriteOptions, "TeradataWriteOptions"), + (Oracle.WriteOptions, "OracleWriteOptions"), + ], +) @pytest.mark.parametrize( "arg, value", [ @@ -100,10 +148,10 @@ def test_jdbc_write_options_cannot_be_used_in_read_options(arg, value): ("predicates", "s"), ], ) -def test_jdbc_read_options_cannot_be_used_in_write_options(arg, value): - error_msg = rf"Options \['{arg}'\] are not allowed to use in a JDBCWriteOptions" +def test_jdbc_read_options_cannot_be_used_in_write_options(options_class, options_class_name, arg, value): + error_msg = rf"Options \['{arg}'\] are not allowed to use in a {options_class_name}" with pytest.raises(ValueError, match=error_msg): - Postgres.WriteOptions.parse({arg: value}) + options_class.parse({arg: value}) @pytest.mark.parametrize( @@ -136,12 +184,23 @@ def test_jdbc_old_options_allowed_but_deprecated(arg, value): assert 
options.dict(by_alias=True)[to_camel(arg)] == value -def test_jdbc_read_options_partitioning_is_not_valid(): +@pytest.mark.parametrize( + "options_class", + [ + Postgres.ReadOptions, + Clickhouse.ReadOptions, + MSSQL.ReadOptions, + MySQL.ReadOptions, + Teradata.ReadOptions, + Oracle.ReadOptions, + ], +) +def test_jdbc_read_options_partitioning_is_not_valid(options_class): with pytest.raises(ValueError): - Postgres.ReadOptions(numPartitions=200) + options_class(numPartitions=200) with pytest.raises(ValueError): - Postgres.ReadOptions(partitionColumn="test") + options_class(partitionColumn="test") def test_jdbc_read_options_case(): @@ -197,70 +256,6 @@ def test_jdbc_write_options_case(): assert camel_case == snake_case -def test_jdbc_read_options_to_jdbc(spark_mock): - connection = Postgres(host="local", user="admin", database="default", password="1234", spark=spark_mock) - jdbc_params = connection.options_to_jdbc_params( - options=Postgres.ReadOptions( - lowerBound=10, - upperBound=1000, - partitionColumn="some_column", - numPartitions=20, - fetchsize=1000, - sessionInitStatement="BEGIN execute immediate 'alter session set '_serial_direct_read'=true", - snake_case_option="left unchanged", - camelCaseOption="left unchanged", - CamelCaseOption="left unchanged", - ), - ) - - assert jdbc_params == { - "column": "some_column", - "lowerBound": "10", - "numPartitions": "20", - "properties": { - "driver": "org.postgresql.Driver", - "fetchsize": "1000", - "password": "1234", - "sessionInitStatement": "BEGIN execute immediate 'alter session set '_serial_direct_read'=true", - "user": "admin", - "snake_case_option": "left unchanged", - "camelCaseOption": "left unchanged", - "CamelCaseOption": "left unchanged", - }, - "upperBound": "1000", - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", - } - - -def test_jdbc_write_options_to_jdbc(spark_mock): - connection = Postgres(host="local", user="admin", database="default", password="1234", 
spark=spark_mock) - jdbc_params = connection.options_to_jdbc_params( - options=Postgres.WriteOptions( - batchsize=1000, - truncate=True, - isolation_level="NONE", - snake_case_option="left unchanged", - camelCaseOption="left unchanged", - CamelCaseOption="left unchanged", - ), - ) - - assert jdbc_params == { - "properties": { - "batchsize": "1000", - "driver": "org.postgresql.Driver", - "password": "1234", - "isolationLevel": "NONE", - "truncate": "true", - "user": "admin", - "snake_case_option": "left unchanged", - "camelCaseOption": "left unchanged", - "CamelCaseOption": "left unchanged", - }, - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", - } - - @pytest.mark.parametrize( "options, value", [ @@ -317,11 +312,69 @@ def test_jdbc_write_options_mode_deprecated(options, value, message): @pytest.mark.parametrize( - "options", + "options_class, options", [ - {"mode": "wrong_mode"}, + (Postgres.WriteOptions, {"mode": "wrong_mode"}), + (Clickhouse.WriteOptions, {"mode": "wrong_mode"}), + (MSSQL.WriteOptions, {"mode": "wrong_mode"}), + (MySQL.WriteOptions, {"mode": "wrong_mode"}), + (Teradata.WriteOptions, {"mode": "wrong_mode"}), + (Oracle.WriteOptions, {"mode": "wrong_mode"}), ], ) -def test_jdbc_write_options_mode_wrong(options): +def test_jdbc_write_options_mode_wrong(options_class, options): with pytest.raises(ValueError, match="value is not a valid enumeration member"): - Postgres.WriteOptions(**options) + options_class(**options) + + +@pytest.mark.parametrize( + "options, expected_message", + [ + ({"num_partitions": 2}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ({"num_partitions": 2, "lower_bound": 0}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ({"num_partitions": 2, "upper_bound": 10}, "lower_bound and upper_bound must be set if num_partitions > 1"), + ], +) +def test_jdbc_sql_options_partition_bounds(options, expected_message): + with pytest.raises(ValueError, 
match=expected_message): + Postgres.SQLOptions(**options) + + +@pytest.mark.parametrize( + "options_class", + [ + Postgres.SQLOptions, + Clickhouse.SQLOptions, + MSSQL.SQLOptions, + MySQL.SQLOptions, + Teradata.SQLOptions, + Oracle.SQLOptions, + ], +) +def test_jdbc_sql_options_partitioning_mode_prohibited(options_class): + with pytest.raises(ValueError, match=r"Options \['partitioning_mode'\] are not allowed"): + options_class(partitioning_mode="range") + + +@pytest.mark.parametrize( + "options_class", + [ + Postgres.SQLOptions, + Clickhouse.SQLOptions, + MSSQL.SQLOptions, + MySQL.SQLOptions, + Teradata.SQLOptions, + Oracle.SQLOptions, + ], +) +def test_jdbc_sql_options_default(options_class): + options = options_class() + assert options.fetchsize == 100_000 + assert options.query_timeout is None + + +def test_jdbc_deprecated_jdbcoptions(): + deprecated_warning = "Deprecated in 0.11.0 and will be removed in 1.0.0. Use FetchOptions or ExecuteOptions instead" + + with pytest.warns(DeprecationWarning, match=deprecated_warning): + Postgres.JDBCOptions(fetchsize=10, query_timeout=30) diff --git a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py index a97212560..2e0ccd1a0 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py @@ -42,6 +42,7 @@ def create_temp_file(tmp_path_factory): ("3.3.0", None, "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0"), ("3.3.0", "2.12", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0"), ("3.3.0", "2.13", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.3.0"), + ("3.3.1", "2.12.2", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1"), ], ) def test_kafka_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 48882f185..f494e3deb 100644 --- 
a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -12,9 +12,9 @@ def test_mongodb_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MongoDB.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" - assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" - assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1" + assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" def test_mongodb_get_packages_no_input(): @@ -38,33 +38,51 @@ def test_mongodb_get_packages_spark_version_not_supported(spark_version): @pytest.mark.parametrize( "scala_version", [ + "2.9.2", "2.11", - "2.14", - "3.0", ], ) def test_mongodb_get_packages_scala_version_not_supported(scala_version): - with pytest.raises(ValueError, match=f"Scala version must be 2.12 - 2.13, got {scala_version}"): + with pytest.raises(ValueError, match=f"Scala version must be at least 2.12, got {scala_version}"): MongoDB.get_packages(scala_version=scala_version) @pytest.mark.parametrize( - "spark_version, scala_version, package", + "spark_version, scala_version, package_version, package", [ - # use Scala version directly - (None, "2.12", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - (None, "2.13", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), - # Detect Scala version by Spark version - ("3.2", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.3", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.4", None, "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - # 
Override Scala version detected automatically - ("3.2", "2.12", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), - ("3.4", "2.13", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), + (None, "2.12", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + (None, "2.13", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.13:10.3.0"), + ("3.2", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.3", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.4", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.2", "2.12", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), + ("3.4", "2.13", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), + ("3.2", "2.12", "10.2.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"), + ("3.2", "2.12", "10.2.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0"), + ("3.2.4", "2.12.1", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), ], ) -def test_mongodb_get_packages(spark_version, scala_version, package): - assert MongoDB.get_packages(spark_version=spark_version, scala_version=scala_version) == [package] +def test_mongodb_get_packages(spark_version, scala_version, package_version, package): + assert MongoDB.get_packages( + spark_version=spark_version, + scala_version=scala_version, + package_version=package_version, + ) == [package] + + +@pytest.mark.parametrize( + "package_version", + [ + "10", + "abc", + ], +) +def test_mongodb_get_packages_invalid_package_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 2\).", + ): + MongoDB.get_packages(scala_version="2.12", package_version=package_version) def test_mongodb_missing_package(spark_no_packages): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py 
b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index 7b0328ca9..3e3f81496 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ -14,31 +14,48 @@ def test_mssql_class_attributes(): def test_mssql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MSSQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8" + assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" -def test_mssql_get_packages_no_input(): - assert MSSQL.get_packages() == ["com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"] - - -@pytest.mark.parametrize("java_version", ["7", "6"]) -def test_mssql_get_packages_java_version_not_supported(java_version): - with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): - MSSQL.get_packages(java_version=java_version) +@pytest.mark.parametrize( + "java_version, package_version, expected_packages", + [ + (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("8", "12.6.2.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", "12.6.2.jre11", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("11", "12.7.0.jre11-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre11-preview"]), + ("8", "12.7.0.jre8-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre8-preview"]), + ("8", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), + ("11", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ], +) +def test_mssql_get_packages(java_version, package_version, expected_packages): + 
assert MSSQL.get_packages(java_version=java_version, package_version=package_version) == expected_packages @pytest.mark.parametrize( - "java_version, package", + "package_version", [ - ("8", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"), - ("9", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"), - ("11", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), - ("17", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), - ("20", "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre11"), + "12.7", + "abc", ], ) -def test_mssql_get_packages(java_version, package): - assert MSSQL.get_packages(java_version=java_version) == [package] +def test_mssql_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + MSSQL.get_packages(package_version=package_version) + + +@pytest.mark.parametrize("java_version", ["7", "6"]) +def test_mssql_get_packages_java_version_not_supported(java_version): + with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): + MSSQL.get_packages(java_version=java_version) def test_mssql_missing_package(spark_no_packages): @@ -75,7 +92,14 @@ def test_mssql(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:sqlserver://some_host:1433;databaseName=database" + assert conn.jdbc_url == "jdbc:sqlserver://some_host:1433" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:1433", + "databaseName": "database", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -91,7 +115,14 @@ def test_mssql_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == 
"jdbc:sqlserver://some_host:5000;databaseName=database" + assert conn.jdbc_url == "jdbc:sqlserver://some_host:5000" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:5000", + "databaseName": "database", + } def test_mssql_without_database_error(spark_mock): @@ -101,7 +132,6 @@ def test_mssql_without_database_error(spark_mock): user="user", password="passwd", spark=spark_mock, - extra={"trustServerCertificate": "true"}, ) @@ -115,10 +145,16 @@ def test_mssql_with_extra(spark_mock): spark=spark_mock, ) - assert ( - conn.jdbc_url - == "jdbc:sqlserver://some_host:1433;characterEncoding=UTF-8;databaseName=database;trustServerCertificate=true" - ) + assert conn.jdbc_url == "jdbc:sqlserver://some_host:1433" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver", + "url": "jdbc:sqlserver://some_host:1433", + "databaseName": "database", + "characterEncoding": "UTF-8", + "trustServerCertificate": "true", + } def test_mssql_with_extra_prohibited(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index ed730c418..e95b34f1a 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -14,11 +14,35 @@ def test_mysql_class_attributes(): def test_mysql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MySQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MySQL.package == "com.mysql:mysql-connector-j:8.0.33" - - -def test_mysql_get_packages(): - assert MySQL.get_packages() == ["com.mysql:mysql-connector-j:8.0.33"] + assert MySQL.package == "com.mysql:mysql-connector-j:8.4.0" + + +@pytest.mark.parametrize( + "package_version, expected_packages", + [ + 
(None, ["com.mysql:mysql-connector-j:8.4.0"]), + ("8.4.0", ["com.mysql:mysql-connector-j:8.4.0"]), + ("8.1.0", ["com.mysql:mysql-connector-j:8.1.0"]), + ("8.0.33", ["com.mysql:mysql-connector-j:8.0.33"]), + ], +) +def test_mysql_get_packages(package_version, expected_packages): + assert MySQL.get_packages(package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "package_version", + [ + "8.3", + "abc", + ], +) +def test_mysql_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + MySQL.get_packages(package_version=package_version) def test_mysql_missing_package(spark_no_packages): @@ -55,7 +79,15 @@ def test_mysql(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -71,7 +103,15 @@ def test_mysql_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:mysql://some_host:5000/database?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:5000/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } def test_mysql_without_database(spark_mock): @@ -84,7 +124,15 @@ def 
test_mysql_without_database(spark_mock): assert conn.password.get_secret_value() == "passwd" assert not conn.database - assert conn.jdbc_url == "jdbc:mysql://some_host:3306?characterEncoding=UTF-8&useUnicode=yes" + assert conn.jdbc_url == "jdbc:mysql://some_host:3306" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + } def test_mysql_with_extra(spark_mock): @@ -97,10 +145,17 @@ def test_mysql_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == ( - "jdbc:mysql://some_host:3306/database?allowMultiQueries=true&characterEncoding=UTF-8&" - "requireSSL=true&useUnicode=yes" - ) + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "UTF-8", + "useUnicode": "yes", + "allowMultiQueries": "true", + "requireSSL": "true", + } conn = MySQL( host="some_host", @@ -111,7 +166,15 @@ def test_mysql_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == ("jdbc:mysql://some_host:3306/database?characterEncoding=CP-1251&useUnicode=no") + assert conn.jdbc_url == "jdbc:mysql://some_host:3306/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.mysql.cj.jdbc.Driver", + "url": "jdbc:mysql://some_host:3306/database", + "characterEncoding": "CP-1251", + "useUnicode": "no", + } def test_mysql_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index 6a875b8f7..892c86906 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -14,11 +14,11 @@ def 
test_oracle_class_attributes(): def test_oracle_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Oracle.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.2.0.0" + assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" def test_oracle_get_packages_no_input(): - assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.2.0.0"] + assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"] @pytest.mark.parametrize("java_version", ["7", "6"]) @@ -28,17 +28,44 @@ def test_oracle_get_packages_java_version_not_supported(java_version): @pytest.mark.parametrize( - "java_version, package", + "java_version, package_version, expected_packages", [ - ("8", "com.oracle.database.jdbc:ojdbc8:23.2.0.0"), - ("9", "com.oracle.database.jdbc:ojdbc8:23.2.0.0"), - ("11", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), - ("17", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), - ("20", "com.oracle.database.jdbc:ojdbc11:23.2.0.0"), + (None, None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("8", "23.4.0.24.05", ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("8", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), + ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("9", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), + ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), + ("11", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), + ("17", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), + ("20", "23.4.0.24.05", ["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), ], ) -def test_oracle_get_packages(java_version, package): - assert Oracle.get_packages(java_version=java_version) == [package] +def test_oracle_get_packages(java_version, package_version, expected_packages): + assert 
Oracle.get_packages(java_version=java_version, package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "java_version, package_version", + [ + ("8", "23.3.0"), + ("11", "23.3"), + ("11", "a.b.c.d"), + ], +) +def test_oracle_get_packages_invalid_version(java_version, package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 4\).", + ): + Oracle.get_packages(java_version=java_version, package_version=package_version) + + +@pytest.mark.parametrize("java_version", ["7", "6"]) +def test_oracle_get_packages_java_version_not_supported(java_version): + with pytest.raises(ValueError, match=f"Java version must be at least 8, got {java_version}"): + Oracle.get_packages(java_version=java_version) def test_oracle_missing_package(spark_no_packages): @@ -76,6 +103,12 @@ def test_oracle(spark_mock): assert conn.sid == "sid" assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:1521:sid", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -92,12 +125,24 @@ def test_oracle_with_port(spark_mock): assert conn.sid == "sid" assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:5000:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:5000:sid", + } def test_oracle_uri_with_service_name(spark_mock): conn = Oracle(host="some_host", user="user", password="passwd", service_name="service", spark=spark_mock) assert conn.jdbc_url == "jdbc:oracle:thin:@//some_host:1521/service" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": 
"jdbc:oracle:thin:@//some_host:1521/service", + } def test_oracle_without_sid_and_service_name(spark_mock): @@ -140,7 +185,15 @@ def test_oracle_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid?connectTimeout=10&tcpKeepAlive=false" + assert conn.jdbc_url == "jdbc:oracle:thin:@some_host:1521:sid" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "oracle.jdbc.driver.OracleDriver", + "url": "jdbc:oracle:thin:@some_host:1521:sid", + "tcpKeepAlive": "false", + "connectTimeout": "10", + } def test_oracle_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index f6b5f4e9b..268525220 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -14,14 +14,38 @@ def test_postgres_class_attributes(): def test_postgres_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Postgres.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Postgres.package == "org.postgresql:postgresql:42.6.0" - - -def test_postgres_get_packages(): - assert Postgres.get_packages() == ["org.postgresql:postgresql:42.6.0"] - - -def test_oracle_missing_package(spark_no_packages): + assert Postgres.package == "org.postgresql:postgresql:42.7.3" + + +@pytest.mark.parametrize( + "package_version, expected_packages", + [ + (None, ["org.postgresql:postgresql:42.7.3"]), + ("42.7.3", ["org.postgresql:postgresql:42.7.3"]), + ("42.7.3-patch", ["org.postgresql:postgresql:42.7.3-patch"]), + ("42.6.0", ["org.postgresql:postgresql:42.6.0"]), + ], +) +def test_postgres_get_packages(package_version, expected_packages): + assert Postgres.get_packages(package_version=package_version) == expected_packages + + +@pytest.mark.parametrize( + "package_version", + [ + "42.2", + 
"abc", + ], +) +def test_postgres_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 3\).", + ): + Postgres.get_packages(package_version=package_version) + + +def test_postgres_missing_package(spark_no_packages): msg = "Cannot import Java class 'org.postgresql.Driver'" with pytest.raises(ValueError, match=msg): Postgres( @@ -55,7 +79,16 @@ def test_postgres(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&stringtype=unspecified" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + "stringtype": "unspecified", + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -71,7 +104,16 @@ def test_postgres_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&stringtype=unspecified" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5000/database", + "ApplicationName": "abc", + "tcpKeepAlive": "true", + "stringtype": "unspecified", + } def test_postgres_without_database_error(spark_mock): @@ -85,14 +127,28 @@ def test_postgres_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"ssl": "true", "autosave": "always"}, + extra={ + "stringtype": "VARCHAR", + "autosave": "always", 
+ "tcpKeepAlive": "false", + "ApplicationName": "override", + "ssl": "true", + }, spark=spark_mock, ) - assert ( - conn.jdbc_url - == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true&stringtype=unspecified" - ) + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database" + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database", + "stringtype": "VARCHAR", + "autosave": "always", + "tcpKeepAlive": "false", + "ApplicationName": "override", + "ssl": "true", + } def test_postgres_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py index dd9ba525d..b71d7e8d1 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py @@ -17,8 +17,31 @@ def test_teradata_package(): assert Teradata.package == "com.teradata.jdbc:terajdbc:17.20.00.15" -def test_teradata_get_packages(): - assert Teradata.get_packages() == ["com.teradata.jdbc:terajdbc:17.20.00.15"] +@pytest.mark.parametrize( + "package_version, expected_package", + [ + (None, ["com.teradata.jdbc:terajdbc:17.20.00.15"]), + ("17.20.00.15", ["com.teradata.jdbc:terajdbc:17.20.00.15"]), + ("16.20.00.13", ["com.teradata.jdbc:terajdbc:16.20.00.13"]), + ], +) +def test_teradata_get_packages_valid_versions(package_version, expected_package): + assert Teradata.get_packages(package_version=package_version) == expected_package + + +@pytest.mark.parametrize( + "package_version", + [ + "20.00.13", + "abc", + ], +) +def test_teradata_get_packages_invalid_version(package_version): + with pytest.raises( + ValueError, + match=rf"Version '{package_version}' does not have enough numeric components for requested format \(expected at least 4\).", + ): + 
Teradata.get_packages(package_version=package_version) def test_teradata_missing_package(spark_no_packages): @@ -59,6 +82,12 @@ def test_teradata(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," "DBS_PORT=1025,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -78,6 +107,12 @@ def test_teradata_with_port(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," "DBS_PORT=5000,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_without_database(spark_mock): @@ -94,6 +129,12 @@ def test_teradata_without_database(spark_mock): "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON," "DBS_PORT=1025,FLATTEN=ON,MAYBENULL=ON,STRICT_NAMES=OFF" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_with_extra(spark_mock): @@ -102,14 +143,20 @@ def test_teradata_with_extra(spark_mock): user="user", password="passwd", database="database", - extra={"TMODE": "TERA", "LOGMECH": "LDAP"}, + extra={"TMODE": "TERA", "LOGMECH": "LDAP", "PARAM_WITH_COMMA": "some,value"}, spark=spark_mock, ) assert conn.jdbc_url == ( "jdbc:teradata://some_host/CHARSET=UTF8,COLUMN_NAME=ON,DATABASE=database," - "DBS_PORT=1025,FLATTEN=ON,LOGMECH=LDAP,MAYBENULL=ON,STRICT_NAMES=OFF,TMODE=TERA" + "DBS_PORT=1025,FLATTEN=ON,LOGMECH=LDAP,MAYBENULL=ON,PARAM_WITH_COMMA='some,value',STRICT_NAMES=OFF,TMODE=TERA" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } conn = 
Teradata( host="some_host", @@ -124,6 +171,12 @@ def test_teradata_with_extra(spark_mock): "jdbc:teradata://some_host/CHARSET=CP-1251,COLUMN_NAME=OFF,DATABASE=database," "DBS_PORT=1025,FLATTEN=OFF,MAYBENULL=OFF,STRICT_NAMES=ON" ) + assert conn.jdbc_params == { + "user": "user", + "password": "passwd", + "driver": "com.teradata.jdbc.TeraDriver", + "url": conn.jdbc_url, + } def test_teradata_with_extra_prohibited(spark_mock): diff --git a/tests/util/assert_df.py b/tests/util/assert_df.py index f7adad032..e36c9af73 100644 --- a/tests/util/assert_df.py +++ b/tests/util/assert_df.py @@ -67,4 +67,4 @@ def assert_subset_df( for column in columns: # noqa: WPS528 difference = ~small_pdf[column].isin(large_pdf[column]) - assert not difference.all(), large_pdf[difference] + assert not difference.all(), small_pdf[difference]