From 68d13f255b5cc768f35b7c6f57f6ffe2b6aa26fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 5 Feb 2024 09:16:22 +0000 Subject: [PATCH 01/63] [DOP-11905] Bump version --- onetl/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/VERSION b/onetl/VERSION index 571215736..5eef0f10e 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.10.1 +0.10.2 From 34cb7f5996045d925c5c27dd83c24066a8ff4027 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:02:34 +0300 Subject: [PATCH 02/63] Bump mikefarah/yq from 4.40.5 to 4.40.6 (#210) Bumps [mikefarah/yq](https://github.com/mikefarah/yq) from 4.40.5 to 4.40.6. - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.40.5...v4.40.6) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index a97dd5a7d..27b7ee8e3 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: 
matrix-oracle - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ -634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.40.5 + uses: mikefarah/yq@v4.40.6 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 8627a849ab7a5bb53aac5bfbd166b031d0f8b013 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Feb 2024 10:55:26 +0300 Subject: [PATCH 03/63] Bump mikefarah/yq from 4.40.6 to 4.40.7 (#212) Bumps [mikefarah/yq](https://github.com/mikefarah/yq) from 4.40.6 to 4.40.7. - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.40.6...v4.40.7) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index 27b7ee8e3..1dc0b1ddd 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.40.6 + uses: 
mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ -634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.40.6 + uses: mikefarah/yq@v4.40.7 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 383e14c772cde25edf23eee29e1cb4ab0661de34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 08:29:42 +0000 Subject: [PATCH 04/63] Temporary avoid FileConnection tests on Python 3.12.2 --- .github/workflows/data/core/matrix.yml | 6 ++++-- .github/workflows/data/ftp/matrix.yml | 3 ++- .github/workflows/data/ftps/matrix.yml | 3 ++- .github/workflows/data/hdfs/matrix.yml | 6 ++++-- .github/workflows/data/s3/matrix.yml | 6 ++++-- .github/workflows/data/samba/matrix.yml | 3 ++- .github/workflows/data/sftp/matrix.yml | 3 ++- .github/workflows/data/webdav/matrix.yml | 3 ++- 8 files changed, 22 insertions(+), 11 deletions(-) diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index 522e2160a..82da65384 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -6,13 +6,15 @@ min: &min max: &max spark-version: 3.5.0 - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml index 86243228c..fd1ca573a 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ b/.github/workflows/data/ftp/matrix.yml @@ -3,7 +3,8 @@ min: &min os: ubuntu-latest max: &max - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index bb61dc6b0..f902c5c98 100644 --- a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -3,7 +3,8 @@ min: &min os: ubuntu-latest max: &max - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index c117c8d6f..07d7844c0 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -8,14 +8,16 @@ min: &min max: &max hadoop-version: hadoop3-hdfs spark-version: 3.5.0 - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: 
ubuntu-latest latest: &latest hadoop-version: hadoop3-hdfs spark-version: latest - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index b2d595d6b..872cb36ca 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -10,14 +10,16 @@ min: &min max: &max minio-version: 2023.7.18 spark-version: 3.5.0 - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: ubuntu-latest latest: &latest minio-version: latest spark-version: latest - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index f03adcd08..8bcb05216 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -3,7 +3,8 @@ min: &min os: ubuntu-latest max: &max - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 3379a6e3e..453966524 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -3,7 +3,8 @@ min: &min os: ubuntu-latest max: &max - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index 0a67ff838..d1c5b9f5d 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -3,7 +3,8 @@ min: &min os: ubuntu-latest max: &max - python-version: '3.12' + # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 + python-version: '3.12.1' os: ubuntu-latest matrix: From d51d6c0d4ae5d96cfb8dba598772a79331918e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:06:52 +0000 Subject: [PATCH 05/63] Fix mypy warnings --- mypy.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mypy.ini b/mypy.ini index 38776255b..38dc2b0fe 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,5 +1,5 @@ [mypy] -python_version = 3.7 +python_version = 3.8 exclude = site-packages|__pycache__|docs|old|build|dist|tests|venv strict_optional=True follow_imports = silent From 8d5c21ba45c7f20b2b285bd11908ab486802e2b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:07:45 +0000 Subject: [PATCH 06/63] Fix YAMLHWMStore docs --- docs/hwm_store/yaml_hwm_store.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/hwm_store/yaml_hwm_store.rst b/docs/hwm_store/yaml_hwm_store.rst index 
b9d268fa7..45cf2612a 100644 --- a/docs/hwm_store/yaml_hwm_store.rst +++ b/docs/hwm_store/yaml_hwm_store.rst @@ -6,4 +6,4 @@ YAML HWM Store .. currentmodule:: onetl.hwm.store.yaml_hwm_store .. autoclass:: YAMLHWMStore - :members: get_hwm, save_hwm, __enter__ + :members: get_hwm, set_hwm, __enter__ From 49c069f8d81cebf5c04c989cea4428afaf8acf42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:12:56 +0000 Subject: [PATCH 07/63] Fix AutoDetectHWM docs --- docs/changelog/0.10.1.rst | 2 +- onetl/db/db_reader/db_reader.py | 9 ++------- onetl/strategy/incremental_strategy.py | 5 +---- onetl/strategy/snapshot_strategy.py | 2 -- 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/docs/changelog/0.10.1.rst b/docs/changelog/0.10.1.rst index 7f30ca284..49aed2425 100644 --- a/docs/changelog/0.10.1.rst +++ b/docs/changelog/0.10.1.rst @@ -11,7 +11,7 @@ Features reader = DBReader( connection=Kafka(...), source="topic_name", - hwm=AutoDetectHWM(name="some_hwm_name", expression="offset"), + hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"), ) with IncrementalStrategy(): diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 7f6f6f0d9..2ce47a54c 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -130,9 +130,7 @@ class DBReader(FrozenModel): .. code:: python - from onetl.hwm import AutoDetectHWM - - hwm = AutoDetectHWM( + hwm = DBReader.AutoDetectHWM( name="some_unique_hwm_name", expression="hwm_column", ) @@ -143,9 +141,7 @@ class DBReader(FrozenModel): .. code:: python - from onetl.hwm import AutoDetectHWM - - hwm = AutoDetectHWM( + hwm = DBReader.AutoDetectHWM( name="some_unique_hwm_name", expression="cast(hwm_column_orig as date)", ) @@ -332,7 +328,6 @@ class DBReader(FrozenModel): from onetl.db import DBReader from onetl.connection import Postgres from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession maven_packages = Postgres.get_packages() diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 8946f7d5d..267b01613 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -196,7 +196,6 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import Postgres from onetl.db import DBReader from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -290,7 +289,6 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import Kafka from onetl.db import DBReader from onetl.strategy import IncrementalStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -323,7 +321,7 @@ class IncrementalStrategy(OffsetMixin, HWMStrategy): from onetl.connection import SFTP from onetl.file import FileDownloader from onetl.strategy import SnapshotStrategy - from etl_entities import FileListHWM + from etl_entities.hwm import FileListHWM sftp = SFTP( host="sftp.domain.com", @@ -495,7 +493,6 @@ class IncrementalBatchStrategy(OffsetMixin, BatchHWMStrategy): from onetl.connection import Postgres, Hive from onetl.db import DBReader from onetl.strategy import IncrementalBatchStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession diff --git 
a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 49bce44dd..5368b2039 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -55,7 +55,6 @@ class SnapshotStrategy(BaseStrategy): from onetl.connection import Postgres from onetl.db import DBReader from onetl.strategy import SnapshotStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession @@ -227,7 +226,6 @@ class SnapshotBatchStrategy(BatchHWMStrategy): from onetl.connection import Postgres, Hive from onetl.db import DBReader from onetl.strategy import SnapshotBatchStrategy - from onetl.hwm import AutoDetectHWM from pyspark.sql import SparkSession From 008c0758bd33c4de9f6dc3ad717fb6ca96d1096f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:15:39 +0000 Subject: [PATCH 08/63] Fix Hive.WriteOptions warning while inserting to existing table --- mypy.ini | 9 --------- onetl/connection/db_connection/hive/connection.py | 3 ++- setup.cfg | 4 ++-- .../test_hive_writer_integration.py | 5 ++++- 4 files changed, 8 insertions(+), 13 deletions(-) delete mode 100644 mypy.ini diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 38dc2b0fe..000000000 --- a/mypy.ini +++ /dev/null @@ -1,9 +0,0 @@ -[mypy] -python_version = 3.8 -exclude = site-packages|__pycache__|docs|old|build|dist|tests|venv -strict_optional=True -follow_imports = silent -show_error_codes = True -# ignore typing in third-party packages -ignore_missing_imports = True -plugins = pydantic.mypy diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 34d5af5cb..6df295481 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -497,7 +497,8 @@ def _insert_into( unsupported_options = write_options.dict(by_alias=True, exclude_unset=True, exclude={"if_exists"}) if unsupported_options: log.warning( - "|%s| Options %r are not supported while inserting into existing table, ignoring", + "|%s| User-specified options %r are ignored while inserting into existing table. 
" + "Using only table parameters from Hive metastore", self.__class__.__name__, unsupported_options, ) diff --git a/setup.cfg b/setup.cfg index d73a28390..74e3f2b28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -426,10 +426,10 @@ per-file-ignores = docstring_style = sphinx [mypy] -python_version = 3.7 +python_version = 3.8 # TODO: remove later exclude = ^(?=.*file).* -strict_optional=True +strict_optional = True # ignore typing in third-party packages ignore_missing_imports = True follow_imports = silent diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 07b30cb07..a841f09f1 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -289,7 +289,10 @@ def test_hive_writer_insert_into_with_options(spark, processing, get_schema_tabl options=options, ) - error_msg = f"|Hive| Options {option_kv} are not supported while inserting into existing table, ignoring" + error_msg = ( + f"|Hive| User-specified options {option_kv} are ignored while inserting into existing table. " + "Using only table parameters from Hive metastore" + ) with caplog.at_level(logging.WARNING): # write to table with new options writer2.run(df2) From 97c8bf84dcd4a96ccc4a2d7987c45b432e6f53a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:53:31 +0000 Subject: [PATCH 09/63] Fix flaky Kafka tests --- .../test_strategy_increment_kafka.py | 139 +++++++++++------- 1 file changed, 89 insertions(+), 50 deletions(-) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py index 1ba96cee1..bd65af56b 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py @@ -26,7 +26,7 @@ def test_kafka_strategy_incremental( kafka_topic, num_partitions, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_type = KeyValueIntHWM hwm_name = secrets.token_hex(5) @@ -71,18 +71,23 @@ def test_kafka_strategy_incremental( with IncrementalStrategy(): first_df = reader.run() + # all the data has been read + deserialized_first_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_first_df, other_frame=first_span, order_by="id_int") + hwm = store.get_hwm(hwm_name) assert hwm is not None assert isinstance(hwm, hwm_type) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} - assert hwm.value == partition_count_dict_first_df - - # all the data has been read - 
deserialized_first_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) - processing.assert_equal_df(df=deserialized_first_df, other_frame=first_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + partition_offsets_initial = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_initial # insert second span processing.insert_pandas_df_into_topic(second_span, kafka_topic) @@ -90,17 +95,22 @@ def test_kafka_strategy_incremental( with IncrementalStrategy(): second_df = reader.run() - hwm = store.get_hwm(hwm_name) - - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = first_df.union(second_df) - partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert hwm.value == partition_count_dict_combined - + # only new data has been read deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(hwm_name) + partition_offsets_new = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new + @pytest.mark.parametrize( "num_partitions", @@ -117,7 +127,7 @@ def test_kafka_strategy_incremental_nothing_to_read( num_partitions, kafka_topic, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_name = secrets.token_hex(5) store = HWMStoreStackManager.get_current() @@ -170,15 +180,20 @@ def test_kafka_strategy_incremental_nothing_to_read( assert reader.has_data() first_df = reader.run() - hwm = store.get_hwm(name=hwm_name) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - partition_counts = first_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_first_df = {row["partition"]: row["count"] for row in partition_counts.collect()} - assert hwm.value == partition_count_dict_first_df - deserialized_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(name=hwm_name) + partition_offsets_initial = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_initial + # no new data yet, nothing to read with IncrementalStrategy(): df = reader.run() @@ -186,7 +201,7 @@ def test_kafka_strategy_incremental_nothing_to_read( assert not df.count() # HWM value is unchanged hwm = store.get_hwm(name=hwm_name) - assert hwm.value == partition_count_dict_first_df + 
assert hwm.value == partition_offsets_initial # insert second span processing.insert_pandas_df_into_topic(second_span, kafka_topic) @@ -195,38 +210,45 @@ def test_kafka_strategy_incremental_nothing_to_read( assert not df.count() # HWM value is unchanged hwm = store.get_hwm(name=hwm_name) - assert hwm.value == partition_count_dict_first_df + assert hwm.value == partition_offsets_initial # read data with IncrementalStrategy(): - df = reader.run() + second_df = reader.run() - hwm = store.get_hwm(name=hwm_name) - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = df.union(first_df) - partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert hwm.value == partition_count_dict_combined + # only new data has been read + deserialized_second_df = processing.json_deserialize(second_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") - deserialized_df = processing.json_deserialize(df, df_schema=kafka_dataframe_schema) - processing.assert_equal_df(df=deserialized_df, other_frame=second_span, order_by="id_int") + # HWM contains mapping `partition: max offset + 1` + hwm = store.get_hwm(name=hwm_name) + partition_offsets_new = dict.fromkeys(range(num_partitions or 1), 0) + partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new @pytest.mark.parametrize( - "initial_partitions, additional_partitions", + "initial_partitions, new_partitions", [ - (3, 2), # starting with 3 partitions, adding 2 more - (5, 1), # starting with 5 partitions, adding 1 more + (3, 5), + (5, 6), ], ) def test_kafka_strategy_incremental_with_new_partition( spark, processing, initial_partitions, - additional_partitions, + new_partitions, kafka_topic, + kafka_dataframe_schema, ): - from pyspark.sql.functions import count as spark_count + from pyspark.sql.functions import max as spark_max hwm_name = secrets.token_hex(5) store = HWMStoreStackManager.get_current() @@ -267,21 +289,38 @@ def test_kafka_strategy_incremental_with_new_partition( # Specified: Set(topic1, topic2) Assigned: Set(topic1, topic2, additional_topic3, additional_topic4) first_df.cache() + # all the data has been read + deserialized_df = processing.json_deserialize(first_df, df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_df, other_frame=first_span, order_by="id_int") + + # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(name=hwm_name) - first_run_hwm_keys_num = len(hwm.value.keys()) + partition_offsets_initial = dict.fromkeys(range(initial_partitions), 0) + partition_offsets_initial.update( + { + row["partition"]: row["offset"] + 1 + for row in first_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) - processing.change_topic_partitions(kafka_topic, initial_partitions + additional_partitions) + processing.change_topic_partitions(kafka_topic, new_partitions) processing.insert_pandas_df_into_topic(second_span, kafka_topic) with IncrementalStrategy(): second_df = reader.run() + # only new data has been read + deserialized_second_df = processing.json_deserialize(second_df, 
df_schema=kafka_dataframe_schema) + processing.assert_equal_df(df=deserialized_second_df, other_frame=second_span, order_by="id_int") + + # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(name=hwm_name) - second_run_hwm_keys_num = len(hwm.value) - assert first_run_hwm_keys_num + additional_partitions == second_run_hwm_keys_num - - # check that HWM distribution of messages in partitions matches the distribution in sparkDF - combined_df = second_df.union(first_df) - partition_counts_combined = combined_df.groupBy("partition").agg(spark_count("*").alias("count")) - partition_count_dict_combined = {row["partition"]: row["count"] for row in partition_counts_combined.collect()} - assert hwm.value == partition_count_dict_combined + partition_offsets_new = dict.fromkeys(range(new_partitions), 0) + partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new.update( + { + row["partition"]: row["offset"] + 1 + for row in second_df.groupBy("partition").agg(spark_max("offset").alias("offset")).collect() + }, + ) + assert hwm.value == partition_offsets_new From e253eabfb4dcfca82be7e3a0c9e6f682ab121ec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 09:54:43 +0000 Subject: [PATCH 10/63] Fix flaky Kafka tests --- .../test_strategy_increment_kafka.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py index bd65af56b..7ae146cca 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_kafka.py @@ -101,8 +101,7 @@ def test_kafka_strategy_incremental( # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(hwm_name) - partition_offsets_new = dict.fromkeys(range(num_partitions or 1), 0) - partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new = partition_offsets_initial.copy() partition_offsets_new.update( { row["partition"]: row["offset"] + 1 @@ -222,8 +221,7 @@ def test_kafka_strategy_incremental_nothing_to_read( # HWM contains mapping `partition: max offset + 1` hwm = store.get_hwm(name=hwm_name) - partition_offsets_new = dict.fromkeys(range(num_partitions or 1), 0) - partition_offsets_new.update(partition_offsets_initial) + partition_offsets_new = partition_offsets_initial.copy() partition_offsets_new.update( { row["partition"]: row["offset"] + 1 From bc224c05bbf53fbc82187b17d12fab61a890b220 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 23:04:47 +0000 Subject: [PATCH 11/63] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.1.1 → 24.2.0](https://github.com/psf/black/compare/24.1.1...24.2.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e209d7e54..b16cf09c0 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,7 +95,7 @@ repos: args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 24.1.1 + rev: 24.2.0 hooks: - id: black language_version: python3 From 8de16ea8b97b5833c81a162ded27fa89e012ca5c Mon Sep 17 00:00:00 2001 From: Maxim Martynov Date: Thu, 15 Feb 2024 13:22:39 +0300 Subject: [PATCH 12/63] Update black in blacken-docs --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b16cf09c0..961c8de69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -105,7 +105,7 @@ repos: hooks: - id: blacken-docs additional_dependencies: - - black==24.1.1 + - black==24.2.0 - repo: meta hooks: From 6414afb77d012b277392ad7ba59652ad99be1880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 10:38:19 +0000 Subject: [PATCH 13/63] Add platform section to docker-compose images which does not support M1 Silicon --- docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 5b4fea437..6ba2aca64 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -90,6 +90,7 @@ services: - 1433:1433 networks: - onetl + platform: linux/amd64 mysql: image: ${MYSQL_IMAGE:-mysql:latest} @@ -99,6 +100,7 @@ services: - 3306:3306 networks: - onetl + platform: linux/amd64 postgres: image: ${POSTGRES_IMAGE:-postgres:15.2-alpine} @@ -136,6 +138,7 @@ services: interval: 10s timeout: 5s retries: 10 + platform: linux/amd64 ftp: image: ${FTP_IMAGE:-chonjay21/ftps:latest} From 8519d3966a5d7f9560a0c4ad8c55631a19e80043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 10:38:53 +0000 Subject: [PATCH 14/63] Fix Greenplum package URL --- docs/conf.py | 2 ++ onetl/connection/db_connection/greenplum/connection.py | 2 +- setup.cfg | 1 + tests/fixtures/spark.py | 1 - 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 958bfe1c5..fe9e92bda 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,6 +61,8 @@ "sphinx_favicon", ] numpydoc_show_class_members = True +autosummary_generate_overwrite = False +autosummary_generate = False autodoc_pydantic_model_show_config = False autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_config_member = False diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 5d0268590..a525713f9 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -64,7 +64,7 @@ class Greenplum(JDBCMixin, DBConnection): """Greenplum connection. |support_hooks| Based on package ``io.pivotal:greenplum-spark:2.3.0`` - (`VMware Greenplum connector for Spark `_). + (`VMware Greenplum connector for Spark `_). .. 
warning:: diff --git a/setup.cfg b/setup.cfg index 74e3f2b28..f34539545 100644 --- a/setup.cfg +++ b/setup.cfg @@ -269,6 +269,7 @@ ignore = WPS201, # WPS429 Found multiple assign targets WPS429, +# https://github.com/wemake-services/wemake-python-styleguide/issues/2847 # E704 multiple statements on one line: def func(): ... E704, # WPS220 Found too deep nesting: 46 > 20 diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index 3196210e1..f56fa4002 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -78,7 +78,6 @@ def maven_packages(): packages.extend(MongoDB.get_packages(spark_version=pyspark_version)) # There is no Excel files support for Spark less than 3.2 - # And there is still no package released for 3.5.0 https://github.com/crealytics/spark-excel/issues/787 packages.extend(Excel.get_packages(spark_version=pyspark_version)) return packages From 589608a06bbc0ebaa3caef6fe8e2aa12a3c781e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 10:39:47 +0000 Subject: [PATCH 15/63] Avoid unnecessary columns sorting while inserting data to Hive table --- onetl/connection/db_connection/hive/connection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 6df295481..328d6b98e 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -507,7 +507,8 @@ def _insert_into( # So we should sort columns according their order in the existing table # instead of using order from the dataframe columns = self._sort_df_columns_like_table(table, df.columns) - writer = df.select(*columns).write + if columns != df.columns: + df = df.select(*columns) # Writer option "partitionOverwriteMode" was added to Spark only in 2.4.0 # so using a workaround with patching Spark config and then setting up the previous value @@ -515,7 +516,7 @@ def _insert_into( overwrite = write_options.if_exists != HiveTableExistBehavior.APPEND log.info("|%s| Inserting data into existing table %r ...", self.__class__.__name__, table) - writer.insertInto(table, overwrite=overwrite) + df.write.insertInto(table, overwrite=overwrite) log.info("|%s| Data is successfully inserted into table %r.", self.__class__.__name__, table) From f934786e8d2715e073dcad410e6bf4f1f0cece37 Mon Sep 17 00:00:00 2001 From: Maxim Martynov Date: Thu, 15 Feb 2024 14:12:37 +0300 Subject: [PATCH 16/63] [DOP-13156] Update release instuction (#214) --- CONTRIBUTING.rst | 67 +++++++++++++++++++++++++++------ docs/changelog/NEXT_RELEASE.rst | 5 --- docs/changelog/index.rst | 1 - 3 files changed, 56 insertions(+), 17 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index cab05a780..32b1f21a2 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -343,38 +343,83 @@ Release Process Before making a release from the ``develop`` branch, follow these steps: +0. Checkout to ``develop`` branch and update it to the actual state + +.. code:: bash + + git checkout develop + git pull -p + 1. Backup ``NEXT_RELEASE.rst`` .. code:: bash - cp docs/changelog/NEXT_RELEASE.rst docs/changelog/temp_NEXT_RELEASE.rst + cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst" 2. Build the Release notes with Towncrier .. 
code:: bash - export VERSION=$(cat onetl/VERSION) - towncrier build --version=${VERSION} + VERSION=$(cat onetl/VERSION) + towncrier build "--version=${VERSION}" --yes -3. Update Changelog +3. Change file with changelog to release version number .. code:: bash - mv docs/changelog/NEXT_RELEASE.rst docs/changelog/${VERSION}.rst + mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst" + +4. Remove content above the version number heading in the ``${VERSION}.rst`` file + +.. code:: bash -4. Edit the ``${VERSION}.rst`` file -Remove content above the version number heading in the ``${VERSION}.rst`` file. + sed "0,/^.*towncrier release notes start/d" -i "docs/changelog/${VERSION}.rst" 5. Update Changelog Index .. code:: bash - awk -v version=${VERSION} '/NEXT_RELEASE/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst + sed -E "s/DRAFT/DRAFT\n ${VERSION}/" -i "docs/changelog/index.rst" -6. Reset ``NEXT_RELEASE.rst`` file +6. Restore ``NEXT_RELEASE.rst`` file from backup .. code:: bash - mv docs/changelog/temp_NEXT_RELEASE.rst docs/changelog/NEXT_RELEASE.rst + mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst" -7. Update the patch version in the ``VERSION`` file of ``develop`` branch **after release**. +7. Commit and push changes to ``develop`` branch + +.. code:: bash + + git add . + git commit -m "Prepare for release ${VERSION}" + git push + +8. Merge ``develop`` branch to ``master``, **WITHOUT** squashing + +.. code:: bash + + git checkout master + git pull + git merge develop + git push + +9. Add git tag to the latest commit in ``master`` branch + +.. code:: bash + + git tag "$VERSION" + git push origin "$VERSION" + +10. Update version in ``develop`` branch **after release**: + +.. code:: bash + + git checkout develop + + NEXT_VERSION=$(echo "$VERSION" | awk -F. '/[0-9]+\./{$NF++;print}' OFS=.) + echo "$NEXT_VERSION" > onetl/VERSION + + git add . + git commit -m "Bump version" + git push diff --git a/docs/changelog/NEXT_RELEASE.rst b/docs/changelog/NEXT_RELEASE.rst index a598be017..28a2621b2 100644 --- a/docs/changelog/NEXT_RELEASE.rst +++ b/docs/changelog/NEXT_RELEASE.rst @@ -1,6 +1 @@ -.. fill up this file using ``towncrier build`` -.. then delete everything up to the header with version number -.. then rename file to ``{VERSION}.rst`` and add it to index.rst -.. and restore ``NEXT_RELEASE.rst`` content`` as it was before running the command above - .. 
towncrier release notes start diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 557ac69b4..45f7c283b 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,7 +3,6 @@ :caption: Changelog DRAFT - NEXT_RELEASE 0.10.1 0.10.0 0.9.5 From bf95ee59a60d9bc34f8f5046c8a724492c27fc8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 15 Feb 2024 12:37:17 +0000 Subject: [PATCH 17/63] Fix docs warnings --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fe9e92bda..f65c954b3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,9 +60,7 @@ "sphinx.ext.extlinks", "sphinx_favicon", ] -numpydoc_show_class_members = True -autosummary_generate_overwrite = False -autosummary_generate = False +numpydoc_show_class_members = False autodoc_pydantic_model_show_config = False autodoc_pydantic_model_show_config_summary = False autodoc_pydantic_model_show_config_member = False From ee019613ac627d71d272a686b2b2346bc7d23046 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 16 Feb 2024 06:29:19 +0000 Subject: [PATCH 18/63] Bump mikefarah/yq from 4.40.7 to 4.41.1 Bumps [mikefarah/yq](https://github.com/mikefarah/yq) from 4.40.7 to 4.41.1. - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.40.7...v4.41.1) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index 1dc0b1ddd..a3ccc49f0 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ 
-634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.40.7 + uses: mikefarah/yq@v4.41.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From c0f074ab926f8951f4e7ed60972e254a29e31e12 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 06:19:20 +0000 Subject: [PATCH 19/63] Bump alexwilson/enable-github-automerge-action from 1.0.0 to 2.0.0 Bumps [alexwilson/enable-github-automerge-action](https://github.com/alexwilson/enable-github-automerge-action) from 1.0.0 to 2.0.0. - [Release notes](https://github.com/alexwilson/enable-github-automerge-action/releases) - [Commits](https://github.com/alexwilson/enable-github-automerge-action/compare/1.0.0...2.0.0) --- updated-dependencies: - dependency-name: alexwilson/enable-github-automerge-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/automerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml index c29d2d6c2..40bec6775 100644 --- a/.github/workflows/automerge.yml +++ b/.github/workflows/automerge.yml @@ -10,7 +10,7 @@ jobs: if: github.event.pull_request.user.login == 'pre-commit-ci[bot]' || github.event.pull_request.user.login == 'dependabot[bot]' steps: - - uses: alexwilson/enable-github-automerge-action@1.0.0 + - uses: alexwilson/enable-github-automerge-action@2.0.0 with: github-token: ${{ secrets.AUTOMERGE_TOKEN }} merge-method: REBASE From cb17a82e0528210f738ff9956870610bbee6c284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 19 Feb 2024 13:18:36 +0000 Subject: [PATCH 20/63] [DOP-13156] Fix release documentation for MacOS --- CONTRIBUTING.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 32b1f21a2..847cb5765 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -373,13 +373,13 @@ Before making a release from the ``develop`` branch, follow these steps: .. code:: bash - sed "0,/^.*towncrier release notes start/d" -i "docs/changelog/${VERSION}.rst" + awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst" 5. Update Changelog Index .. code:: bash - sed -E "s/DRAFT/DRAFT\n ${VERSION}/" -i "docs/changelog/index.rst" + awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst 6. 
Restore ``NEXT_RELEASE.rst`` file from backup From 11aa8012cce4fb49e0c0fb019bbb17f16f9abd64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 19 Feb 2024 13:35:50 +0000 Subject: [PATCH 21/63] [DOP-11877] Group dependabot PRs --- .github/dependabot.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a5c01bee7..a6fa6548d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,3 +8,8 @@ updates: interval: daily labels: - type:ci + # https://til.simonwillison.net/github/dependabot-python-setup + groups: + github-actions: + patterns: + - '*' From 27d2a885e671f7f3e9b8bbd09af8cb9ef2b4503d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 19 Feb 2024 13:55:10 +0000 Subject: [PATCH 22/63] [DOP-13209] Add labels sync workflow --- .github/labels.yml | 153 +++++++++++++++++++++++++ .github/workflows/repo-labels-sync.yml | 27 +++++ 2 files changed, 180 insertions(+) create mode 100644 .github/labels.yml create mode 100644 .github/workflows/repo-labels-sync.yml diff --git a/.github/labels.yml b/.github/labels.yml new file mode 100644 index 000000000..20a4408dd --- /dev/null +++ b/.github/labels.yml @@ -0,0 +1,153 @@ +# config for https://github.com/marketplace/actions/github-labeler + + - name: attention:help wanted + description: Extra attention is needed + color: bfdadc + from_name: help wanted + + - name: attention:invalid + description: This doesn't seem right + color: ea2357 + from_name: invalid + + - name: ci:skip-changelog + description: Add this label to skip changelog file check + color: 04990f + + - name: component:connection + description: Connection-related + color: 75aeb3 + + - name: component:db + description: DB-related + color: 5319e7 + + - name: component:file + description: DB-related + color: 75f526 + + - name: component:hooks + description: Hooks-related + color: 006b75 + + - name: component:hwm + description: HWM-related + color: 670f54 + + - name: component:plugins + description: Plugins-related + color: 77ba7b + + - name: component:spark + description: Spark-related + color: e09183 + + - name: db:clickhouse + description: Clickhouse-related + color: 1d76db + + - name: db:greenplum + description: Greenplum-related + color: c2e0c6 + + - name: db:hive + description: Hive-related + color: ef2895 + + - name: db:kafka + description: Kafka-related + color: f00a38 + + - name: db:mongodb + description: MongoDB-related + color: 1587c4 + + - name: db:mssql + description: MSSQL-related + color: ad368a + + - name: db:mysql + description: MySQL-related + color: '473255' + + - name: db:oracle + description: Oracle-related + color: '646821' + + - name: db:postgres + description: Postgres-related + color: 1077a6 + + - name: db:teradata + description: Teradata-related + color: 04115d + + - name: file:ftp + description: FTP-related + color: e41c74 + + - name: file:ftps + description: FTPS-related + color: 07c469 + + - name: file:hdfs + description: HDFS-related + color: 0295d0 + + - name: file:s3 + description: S3-related + color: b0e8f9 + + - name: file:sftp + description: SFTP-related + color: 3e9d6d + + - name: file:webdav + description: WebDAV-related + color: 6d1c10 + + - 
name: kind:bug + description: Something isn't working + color: d73a4a + from_name: bug + + - name: kind:feature + description: New feature or request + color: 389a3f + + - name: kind:improvement + description: Improvement of some existing feature + color: 1a92c2 + from_name: enhancement + + - name: kind:question + description: Further information is requested + color: 0e857c + from_name: question + + - name: resolution:duplicate + description: This issue or pull request already exists + color: cfd3d7 + from_name: duplicate + + - name: resolution:wontfix + description: This will not be worked on + color: ec103b + from_name: wontfix + + - name: type:ci + description: CI-related changes + color: cdb0bd + + - name: type:dependency + description: Dependency-related changes + color: 214efe + + - name: type:documentation + description: Improvements or additions to documentation + color: 6b9f54 + from_name: documentation + + - name: type:tests + description: Tests-related changes + color: 5cca5b diff --git a/.github/workflows/repo-labels-sync.yml b/.github/workflows/repo-labels-sync.yml new file mode 100644 index 000000000..c0b1e4c5b --- /dev/null +++ b/.github/workflows/repo-labels-sync.yml @@ -0,0 +1,27 @@ +name: Repo labels sync + +on: + push: + branches: + - develop + paths: + - .github/labels.yml + - .github/workflows/repo-labels-sync.yml + pull_request: + paths: + - .github/labels.yml + - .github/workflows/repo-labels-sync.yml + +jobs: + labeler: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run Labeler + uses: crazy-max/ghaction-github-labeler@v5 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + yaml-file: .github/labels.yml + dry-run: ${{ github.event_name == 'pull_request' }} From 722dd4bb0a670cfe6cf9a671eab961f6abaf2e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 19 Feb 2024 14:06:36 +0000 Subject: [PATCH 23/63] [DOP-11877] Run Dependabot weekly --- .github/dependabot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index a6fa6548d..6b89502b5 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,7 +5,7 @@ updates: - package-ecosystem: github-actions directory: / schedule: - interval: daily + interval: weekly labels: - type:ci # https://til.simonwillison.net/github/dependabot-python-setup From a8c840345dca758fae5f6f705ac708a27400e115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 19 Feb 2024 14:16:12 +0000 Subject: [PATCH 24/63] [DOP-13209] Add pre-commit.ci badge to Readme --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0372381ae..ffc68e52b 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ onETL ===== |Repo Status| |PyPI| |PyPI License| |PyPI Python Version| -|Documentation| |Build Status| |Coverage| +|Documentation| |Build Status| |Coverage| |pre-commit.ci| .. |Repo Status| image:: https://www.repostatus.org/badges/latest/active.svg :target: https://github.com/MobileTeleSystems/onetl @@ -20,6 +20,8 @@ onETL :target: https://github.com/MobileTeleSystems/onetl/actions .. 
|Coverage| image:: https://codecov.io/gh/MobileTeleSystems/onetl/branch/develop/graph/badge.svg?token=RIO8URKNZJ :target: https://codecov.io/gh/MobileTeleSystems/onetl +.. |pre-commit.ci| image:: https://results.pre-commit.ci/badge/github/MobileTeleSystems/onetl/develop.svg + :target: https://results.pre-commit.ci/latest/github/MobileTeleSystems/onetl/develop |Logo| From 04757b775d9567c921f24ba7c25d56485e062403 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:24:18 +0000 Subject: [PATCH 25/63] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/Lucas-C/pre-commit-hooks: v1.5.4 → v1.5.5](https://github.com/Lucas-C/pre-commit-hooks/compare/v1.5.4...v1.5.5) - [github.com/asottile/pyupgrade: v3.15.0 → v3.15.1](https://github.com/asottile/pyupgrade/compare/v3.15.0...v3.15.1) - [github.com/PyCQA/autoflake: v2.2.1 → v2.3.0](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.0) --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 961c8de69..55ad11f4d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/Lucas-C/pre-commit-hooks - rev: v1.5.4 + rev: v1.5.5 hooks: - id: forbid-tabs - id: remove-tabs @@ -89,7 +89,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.0 + rev: v3.15.1 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] @@ -113,7 +113,7 @@ repos: - id: check-useless-excludes - repo: https://github.com/PyCQA/autoflake - rev: v2.2.1 + rev: v2.3.0 hooks: - id: autoflake args: From d2536b677bd0c363bfb8b8f4adb485d5c6f0c519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 8 Feb 2024 12:28:52 +0000 Subject: [PATCH 26/63] [DOP-11906] Improve Clickhouse documentation --- .../next_release/211.improvement.rst | 3 + .../db_connection/clickhouse/execute.rst | 90 ++++++ .../db_connection/clickhouse/index.rst | 8 + .../clickhouse/prerequisites.rst | 69 +++++ .../db_connection/clickhouse/read.rst | 72 ++++- .../db_connection/clickhouse/sql.rst | 55 ++++ .../db_connection/clickhouse/types.rst | 257 ++++++++++++++++++ .../db_connection/clickhouse/write.rst | 47 +++- .../db_connection/clickhouse/connection.py | 30 +- .../db_connection/jdbc_mixin/connection.py | 12 +- 10 files changed, 598 insertions(+), 45 deletions(-) create mode 100644 docs/changelog/next_release/211.improvement.rst create mode 100644 docs/connection/db_connection/clickhouse/prerequisites.rst create mode 100644 docs/connection/db_connection/clickhouse/sql.rst create mode 100644 docs/connection/db_connection/clickhouse/types.rst diff --git a/docs/changelog/next_release/211.improvement.rst b/docs/changelog/next_release/211.improvement.rst new file mode 100644 index 000000000..853a7cf94 --- /dev/null +++ b/docs/changelog/next_release/211.improvement.rst @@ -0,0 +1,3 @@ +Improve Clickhouse documentation: + * Add "Types" section describing mapping between Clickhouse and Spark types + * Add "Prerequisites" section describing different aspects of connecting to Clickhouse diff --git 
a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index f43eb3995..f1ee68ff5 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -3,6 +3,96 @@ Executing statements in Clickhouse ================================== +How to +------ + +There are 2 ways to execute some statement in Clickhouse + +Use :obj:`Clickhouse.fetch ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Clickhouse config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Clickhouse.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Clickhouse, like: + +* ``SELECT ... FROM ...`` +* ``WITH alias AS (...) SELECT ...`` +* ``SHOW ...`` + +It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + + df = clickhouse.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Clickhouse.JDBCOptions(query_timeout=10), + ) + clickhouse.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`Clickhouse.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Clickhouse.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Clickhouse, like: + +* ``CREATE TABLE ...`` +* ``ALTER ...`` +* ``INSERT INTO ... AS SELECT ...`` + +It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + + with clickhouse: + clickhouse.execute("DROP TABLE schema.table") + clickhouse.execute( + """ + CREATE TABLE schema.table AS ( + id UInt8, + key String, + value Float32 + ) + ENGINE = MergeTree() + ORDER BY id + """, + options=Clickhouse.JDBCOptions(query_timeout=10), + ) + + +References +---------- + .. currentmodule:: onetl.connection.db_connection.clickhouse.connection .. automethod:: Clickhouse.fetch diff --git a/docs/connection/db_connection/clickhouse/index.rst b/docs/connection/db_connection/clickhouse/index.rst index 1e0d1de65..fcb0ba18d 100644 --- a/docs/connection/db_connection/clickhouse/index.rst +++ b/docs/connection/db_connection/clickhouse/index.rst @@ -7,6 +7,7 @@ Clickhouse :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ Clickhouse :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst new file mode 100644 index 000000000..35c3f04e6 --- /dev/null +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -0,0 +1,69 @@ +.. 
_clickhouse-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Clickhouse server versions: 20.7 or higher +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Clickhouse connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Clickhouse +----------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connector can only use **HTTP** (usually ``8123`` port) or **HTTPS** (usually ``8443`` port) protocol. + +TCP and GRPC protocols are NOT supported. + +Clickhouse cluster interaction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you're using Clickhouse cluster, it is currently possible to connect only to one specific cluster node. +Connecting to multiple nodes simultaneously is not supported. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Clickhouse cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow external tables in the same schema as target table + GRANT CREATE TABLE ON myschema.* TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + + .. code-tab:: sql Write only + + -- allow external tables in the same schema as target table + GRANT CREATE TABLE ON myschema.* TO username; + + -- allow read access to specific table (to get column types) + -- allow write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index a2c07733c..fcc165062 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -1,18 +1,74 @@ .. _clickhouse-read: -Reading from Clickhouse -======================= +Reading from Clickhouse using ``DBReader`` +========================================== -There are 2 ways of distributed data reading from Clickhouse: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Clickhouse.sql ` + Please take into account :ref:`clickhouse-types` -Both methods accept :obj:`JDBCReadOptions ` +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like JOINs. -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +Supported DBReader features +--------------------------- -.. automethod:: Clickhouse.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Clickhouse) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBReader + + clickhouse = Clickhouse(...) 
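+    # ReadOptions below split the read into 10 parallel JDBC queries
+    # by the "id" column (assumes "id" is a numeric column suitable for partitioning)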
+ + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + clickhouse = Clickhouse(...) + + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"), + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +References +---------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst new file mode 100644 index 000000000..5550cde7b --- /dev/null +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -0,0 +1,55 @@ +.. _clickhouse-sql: + +Reading from Clickhouse using ``Clickhouse.sql`` +================================================ + +.. warning:: + + Please take into account :ref:`clickhouse-types` + +:obj:`Clickhouse.sql ` allows passing custom SQL query, +but does not support incremental strategies. + +Method also accepts :obj:`JDBCReadOptions `. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ``SELECT ...`` +* ``WITH ... SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS String) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.clickhouse.connection + +.. automethod:: Clickhouse.sql diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst new file mode 100644 index 000000000..34b785b3c --- /dev/null +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -0,0 +1,257 @@ +.. 
_clickhouse-types: + +Clickhouse <-> Spark type mapping +================================= + +Generic types +------------- + +* ``LowCardinality(T)`` is same as ``T`` +* ``Nullable(T)`` is same as ``T``, but Spark column is inferred as ``nullable=True`` + +Numeric types +------------- + ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | ++======================================+============================================+========================================+============================================+ +| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal`` | ``DecimalType(10,0)`` | ``Decimal(10,0)`` | ``Decimal(10,0)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal(P)`` | ``DecimalType(P,0)`` | ``Decimal(P,0)`` | ``Decimal(P,0)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal(P,S)``, 0 <= P <= S <= 38 | ``DecimalType(P,S)`` | ``Decimal(P,S)`` | ``Decimal(P,S)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal(P,S)``, P,S > 38 | ``DecimalType(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal32(S)`` | ``DecimalType(9,S)`` | ``Decimal(9,S)`` | ``Decimal(9,S)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal64(S)`` | ``DecimalType(18,S)`` | ``Decimal(18,S)`` | ``Decimal(18,S)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal128(S)`` | ``DecimalType(38,S)`` | ``Decimal(38,S)`` | ``Decimal(38,S)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Decimal256(S)`` | ``DecimalType(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | ++--------------------------------------+ + + + +| ``Float64`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | 
++--------------------------------------+ + + + +| ``Int16`` | | | | ++--------------------------------------+ + + + +| ``Int32`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ``BIGINT`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``Int256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | ++--------------------------------------+ + + + +| ``UInt16`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------------+ + + + +| ``UInt64`` | | | | ++--------------------------------------+ + + + +| ``UInt128`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +| ``UInt256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ++--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ + +Temporal types +-------------- + ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | ++======================================+============================================+==================================+============================================+ +| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +| ``Date32`` | unsupported | | | ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +| ``DateTime32``, seconds | ``TimestampType()``, milliseconds | ``DateTime64(6)``, milliseconds, | ``DateTime``, seconds, **precision loss** | ++--------------------------------------+ + + + +| ``DateTime64(3)``, milliseconds | | | | ++--------------------------------------+ + + + +| ``DateTime64(6)``, microseconds | | | | ++--------------------------------------+--------------------------------------------+----------------------------------+ + +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, milliseconds, | 
``DateTime64(6)``, milliseconds, | | +| | **precision loss** | **precision loss** | + ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +| ``IntervalNanosecond`` | unsupported | | | ++--------------------------------------+ + + + +| ``IntervalMicrosecond`` | | | | ++--------------------------------------+ + + + +| ``IntervalMillisecond`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------------+ + + + +| ``IntervalMinute`` | | | | ++--------------------------------------+ + + + +| ``IntervalHour`` | | | | ++--------------------------------------+ + + + +| ``IntervalDay`` | | | | ++--------------------------------------+ + + + +| ``IntervalMonth`` | | | | ++--------------------------------------+ + + + +| ``IntervalQuarter`` | | | | ++--------------------------------------+ + + + +| ``IntervalWeek`` | | | | ++--------------------------------------+ + + + +| ``IntervalYear`` | | | | ++--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ + +String types +------------ + ++--------------------------------------+------------------+------------------------+--------------------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | ++======================================+==================+========================+============================================+ +| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | ++--------------------------------------+ + + + +| ``IPv6`` | | | | ++--------------------------------------+ + + + +| ``Enum8`` | | | | ++--------------------------------------+ + + + +| ``Enum16`` | | | | ++--------------------------------------+ + + + +| ``FixedString(N)`` | | | | ++--------------------------------------+ + + + +| ``String`` | | | | ++--------------------------------------+------------------+------------------------+--------------------------------------------+ + +Unsupported types +----------------- + +Columns of these types cannot be read/written by Spark: + * ``AggregateFunction(func, T)`` + * ``Array(T)`` + * ``JSON`` + * ``Map(K, V)`` + * ``MultiPolygon`` + * ``Nested(field1 T1, ...)`` + * ``Nothing`` + * ``Point`` + * ``Polygon`` + * ``Ring`` + * ``SimpleAggregateFunction(func, T)`` + * ``Tuple(T1, T2, ...)`` + * ``UUID`` + +This is because there is no dedicated Clickhouse dialect for Spark, so some types cannot be properly converted to Spark types (e.g. ``Array``), +even if Spark does support such type (e.g. ``ArrayType()``). + +The is a way to avoid this - just cast unsupported type to ``String``. + +Read unsupported column type +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format: + +.. code:: sql + + SELECT CAST(array_column AS String) FROM ... + + -- or + + SELECT toJSONString(array_column) FROM ... + +And then cast string column in resulting dataframe to proper type using `from_json `_: + +.. code:: python + + from pyspark.sql.functions import from_json + from pyspark.sql.types import ArrayType, IntegerType + + df = clickhouse.sql(...) 
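+    # df.array_column is a String column with JSON content,
+    # e.g. produced by CAST(array_column AS String) or toJSONString(array_column) in the query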
+ + # Spark requires all columns to have some specific type, describe it + column_type = ArrayType(IntegerType()) + + parsed_array = from_json(df.array_column, schema).alias("array_column") + df = df.select(parsed_array) + +Write unsupported column type +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Convert dataframe column to JSON using `to_json `_, +and write it as ``String`` column in Clickhouse: + +.. code:: sql + + CREATE TABLE target_tbl AS ( + id Int32, + array_column_json String, + ) + ENGINE = MergeTree() + ORDER BY time + +.. code:: python + + from pyspark.sql.functions import to_json + + array_column_json = to_json(df.array_column) + df = df.select(df.id, array_column_json.alias("array_column_json")) + + writer.run(df) + +Then you can parse this column on Clickhouse side: + +.. code:: sql + + SELECT id, JSONExtract(json_column, 'Array(String)') FROM target_tbl + +You can also use `MATERIALIZED `_ +and `ALIAS `_ columns +to avoid writing such expression in every ``SELECT`` clause all the time. + +Downsides: + +* Using ``SELECT JSONExtract(...)`` or ``ALIAS`` column can be expensive, because value is calculated on every row access. This can be especially harmful if such column is used in ``WHERE`` clause. +* Using ``MATERIALIZED`` column allows to perform such expensive calculation just once, but this requires up to 2x storage, because Clickhouse stores both raw and parsed column. +* Both ``ALIAS`` and ``MATERIALIZED`` columns are not included in ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. + +.. warning:: + + `EPHEMERAL `_ columns are not supported by Spark + because they cannot be selected to determine target column type. + +Creating tables using Spark +--------------------------- + +.. warning:: + + Absolutely not recommended! + +If Spark dataframe is written to Clickhouse table which does not exists yet, it will be automatically created. + +But Spark will use types from Generic JDBC dialect, and generic types like ``TIMESTAMP`` have different precision +than Clickhouse-specific ``DateTime32`` / ``DateTime64``. + +Always prefer creating tables with specific types **BEFORE WRITING DATA** using Spark: + +.. code:: python + + clickhouse.execute( + """ + CREATE TABLE target_tbl AS ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY value + """, + ) + +References +---------- + +Here you can find the source code used by Clickhouse JDBC and Spark for performing type conversion: + +* `Clickhouse -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> Clickhouse `_ diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 97cc95d36..0bc305930 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -1,9 +1,50 @@ .. _clickhouse-write: -Writing to Clickhouse -===================== +Writing to Clickhouse using ``DBWriter`` +======================================== -For writing data to Clickhouse, use :obj:`DBWriter ` with options below. +.. warning:: + + Please take into account :ref:`clickhouse-types` + +.. warning:: + + It is always recommended to create table explicitly using :obj:`Clickhouse.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. 
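+
+For example, a minimal sketch of creating the target table explicitly before running ``DBWriter``
+(table name, column set and ``ENGINE`` below are placeholders, adjust them to your data):
+
+.. code-block:: python
+
+    from onetl.connection import Clickhouse
+
+    clickhouse = Clickhouse(...)
+
+    # create table with exact Clickhouse types instead of relying on Spark's DDL generation
+    clickhouse.execute(
+        """
+        CREATE TABLE schema.table (
+            id UInt8,
+            value DateTime64(6) -- explicit type and precision
+        )
+        ENGINE = MergeTree()
+        ORDER BY id
+        """,
+    )
+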
+ +For writing data to Clickhouse, use :obj:`DBWriter `. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Clickhouse + from onetl.db import DBWriter + + clickhouse = Clickhouse(...) + + df = ... # data is here + + writer = DBWriter( + connection=clickhouse, + target="schema.table", + options=Clickhouse.WriteOptions( + if_exists="replace_entire_table", + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 63eeb455e..88de5645a 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -31,29 +31,9 @@ class Clickhouse(JDBCConnection): Based on Maven package ``ru.yandex.clickhouse:clickhouse-jdbc:0.3.2`` (`official Clickhouse JDBC driver `_). - .. dropdown:: Version compatibility - - * Clickhouse server versions: 20.7 or higher - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Clickhouse connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`clickhouse-prerequisites` Parameters ---------- @@ -82,9 +62,11 @@ class Clickhouse(JDBCConnection): For example: ``{"continueBatchOnError": "false"}``. - See `Clickhouse JDBC driver properties documentation - `_ - for more details + See: + * `Clickhouse JDBC driver properties documentation `_ + * `Clickhouse core settings documentation `_ + * `Clickhouse query complexity documentation `_ + * `Clickhouse query level settings `_ Examples -------- diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index d7324db0e..e22ad6f2d 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -92,7 +92,7 @@ def close(self): .. code:: python - df = connection.fetch("SELECT * FROM mytable") + df = connection.fetch("SELECT * FROM mytable LIMIT 10") assert df.count() connection.close() @@ -207,15 +207,7 @@ def fetch( .. code:: python - df = connection.fetch("SELECT * FROM mytable") - assert df.count() - - Read data from a table with options: - - .. code:: python - - # reads data from table in batches, 10000 rows per batch - df = connection.fetch("SELECT * FROM mytable", {"fetchsize": 10000}) + df = connection.fetch("SELECT * FROM mytable LIMIT 10") assert df.count() """ From bc7222e043da1c9f28e002fe13e3cd252e59e026 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:23:10 +0000 Subject: [PATCH 27/63] Bump the github-actions group with 1 update Bumps the github-actions group with 1 update: [mikefarah/yq](https://github.com/mikefarah/yq). 
Updates `mikefarah/yq` from 4.41.1 to 4.42.1 - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.41.1...v4.42.1) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index a3ccc49f0..de57b161f 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' 
.github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ -634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.41.1 + uses: mikefarah/yq@v4.42.1 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From d42a303c2dca8b552e643acfa5a25fe3ef2af0b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 26 Feb 2024 08:23:03 +0000 Subject: [PATCH 28/63] Run FileConnection tests on Python 3.12.2 --- .github/workflows/data/core/matrix.yml | 6 ++---- .github/workflows/data/ftp/matrix.yml | 3 +-- .github/workflows/data/ftps/matrix.yml | 3 +-- .github/workflows/data/hdfs/matrix.yml | 6 ++---- .github/workflows/data/s3/matrix.yml | 6 ++---- .github/workflows/data/samba/matrix.yml | 3 +-- .github/workflows/data/sftp/matrix.yml | 3 +-- .github/workflows/data/webdav/matrix.yml | 3 +-- 8 files changed, 11 insertions(+), 22 deletions(-) diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index 82da65384..522e2160a 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -6,15 +6,13 @@ min: &min max: &max spark-version: 3.5.0 - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/ftp/matrix.yml b/.github/workflows/data/ftp/matrix.yml index fd1ca573a..86243228c 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ b/.github/workflows/data/ftp/matrix.yml @@ -3,8 +3,7 @@ min: &min os: ubuntu-latest max: &max - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index f902c5c98..bb61dc6b0 100644 --- a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -3,8 +3,7 @@ min: &min os: ubuntu-latest max: &max - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index 07d7844c0..c117c8d6f 100644 --- 
a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -8,16 +8,14 @@ min: &min max: &max hadoop-version: hadoop3-hdfs spark-version: 3.5.0 - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest hadoop-version: hadoop3-hdfs spark-version: latest - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 872cb36ca..b2d595d6b 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -10,16 +10,14 @@ min: &min max: &max minio-version: 2023.7.18 spark-version: 3.5.0 - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest minio-version: latest spark-version: latest - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index 8bcb05216..f03adcd08 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -3,8 +3,7 @@ min: &min os: ubuntu-latest max: &max - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 453966524..3379a6e3e 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -3,8 +3,7 @@ min: &min os: ubuntu-latest max: &max - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' os: ubuntu-latest matrix: diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index d1c5b9f5d..0a67ff838 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -3,8 +3,7 @@ min: &min os: ubuntu-latest max: &max - # TODO: replace with 3.12 after release of https://github.com/python/typing_extensions/pull/324 - python-version: '3.12.1' + python-version: '3.12' os: ubuntu-latest matrix: From 6849b4640eae8cf46d5a8d9df59f2cfedafe97c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 26 Feb 2024 08:31:34 +0000 Subject: [PATCH 29/63] Fix Hive.WriteOptions documentation --- onetl/connection/db_connection/hive/options.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 357584660..222f914fc 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -206,7 +206,7 @@ class Config: .. 
warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ partition_by: Optional[Union[List[str], str]] = Field(default=None, alias="partitionBy") @@ -233,7 +233,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ bucket_by: Optional[Tuple[int, Union[List[str], str]]] = Field(default=None, alias="bucketBy") # noqa: WPS234 @@ -258,14 +258,14 @@ class Config: It is recommended to use this option **ONLY** if you have a large table (hundreds of Gb or more), which is used mostly for JOINs with other tables, - and you're inserting data using ``if_exists=overwrite_partitions`` or ``if_exists=recreate_entire_table``. + and you're inserting data using ``if_exists=overwrite_partitions`` or ``if_exists=replace_entire_table``. Otherwise Spark will create a lot of small files (one file for each bucket and each executor), drastically **decreasing** HDFS performance. .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ sort_by: Optional[Union[List[str], str]] = Field(default=None, alias="sortBy") @@ -283,7 +283,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ compression: Optional[str] = None @@ -294,7 +294,7 @@ class Config: .. warning:: - Used **only** while **creating new table**, or in case of ``if_exists=recreate_entire_table`` + Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ @validator("sort_by") From 3fe2aa8de55b1a055965863ebdcbec96ed161869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 21 Feb 2024 15:50:32 +0000 Subject: [PATCH 30/63] [DOP-13259] Update Clickhouse types documentation --- .../db_connection/clickhouse/read.rst | 4 +- .../db_connection/clickhouse/types.rst | 476 +++++++++++------- .../db_connection/clickhouse/write.rst | 7 +- 3 files changed, 304 insertions(+), 183 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index fcc165062..e421f340b 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -67,8 +67,8 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -References ----------- +Read options +------------ .. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 34b785b3c..01d6e6b53 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -3,130 +3,273 @@ Clickhouse <-> Spark type mapping ================================= +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Clickhouse +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Clickhouse connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Clickhouse type. +* Find corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing Clickhuse table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Clickhouse connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT column1, colum2, ... FROM table LIMIT 0`` query +* For each column in query result get column name and Clickhouse type. +* **Find corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` -> ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``Clickhousetype (write)`` match ``Clickhouse type (read)``, no additional casts will be performed, DataFrame column will be written to Clickhouse as is. +* If ``Clickhousetype (write)`` does not match ``Clickhouse type (read)``, DataFrame column will be casted to target column type **on Clickhouse side**. For example, you can write column with text data to ``Int32`` column, if column contains valid integer values within supported value range and precision. + +.. [1] + This allows to write data to tables with ``DEFAULT`` columns - if DataFrame has no such column, + it will be populated by Clickhouse. + +.. [2] + + Yes, this is weird. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how Clickhouse connector performs this: + +* Find corresponding ``Spark type`` -> ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in Clickhouse, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But Spark does not have specific dialect for Clickhouse, so Generic JDBC dialect is used. +Generic dialect is using SQL ANSI type names while creating tables in target database, not database-specific types. + +If some cases this may lead to using wrong column type. For example, Spark creates column of type ``TIMESTAMP`` +which corresponds to Clickhouse's type ``DateTime32`` (precision up to seconds) +instead of more precise ``DateTime64`` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +.. 
code:: python + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. code:: python + + clickhouse.execute( + """ + CREATE TABLE default.target_tbl AS ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `Clickhouse -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> Clickhouse `_ + +Supported types +--------------- + Generic types -------------- +~~~~~~~~~~~~~ * ``LowCardinality(T)`` is same as ``T`` * ``Nullable(T)`` is same as ``T``, but Spark column is inferred as ``nullable=True`` Numeric types -------------- - -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+============================================+========================================+============================================+ -| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal`` | ``DecimalType(10,0)`` | ``Decimal(10,0)`` | ``Decimal(10,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P)`` | ``DecimalType(P,0)`` | ``Decimal(P,0)`` | ``Decimal(P,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P,S)``, 0 <= P <= S <= 38 | ``DecimalType(P,S)`` | ``Decimal(P,S)`` | ``Decimal(P,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P,S)``, P,S > 38 | ``DecimalType(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal32(S)`` | ``DecimalType(9,S)`` | ``Decimal(9,S)`` | ``Decimal(9,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal64(S)`` | ``DecimalType(18,S)`` | ``Decimal(18,S)`` | ``Decimal(18,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal128(S)`` | ``DecimalType(38,S)`` | ``Decimal(38,S)`` | ``Decimal(38,S)`` | 
-+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal256(S)`` | ``DecimalType(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | -+--------------------------------------+ + + + -| ``Float64`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | -+--------------------------------------+ + + + -| ``Int16`` | | | | -+--------------------------------------+ + + + -| ``Int32`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int64`` | ``LongType()`` | ``Int64`` | ``BIGINT`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | -+--------------------------------------+ + + + -| ``UInt16`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | -+--------------------------------------+ + + + -| ``UInt64`` | | | | -+--------------------------------------+ + + + -| ``UInt128`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +~~~~~~~~~~~~~ + ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++================================+===================================+===============================+===============================+ +| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | 
++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=39..76, S=0..76)`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal32(P=0..9)`` | ``DecimalType(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal64(S=0..18)`` | ``DecimalType(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal128(S=0..38)`` | ``DecimalType(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal256(S=0..76)`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | ++--------------------------------+ | | | +| ``Float64`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``Int16`` | | | | ++--------------------------------+ | | | +| ``Int32`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int256`` | unsupported [3]_ | | | 
++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ByteType()`` | ``Int8`` | ``Int8`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+ | | | +| ``UInt64`` | | | | ++--------------------------------+ | | | +| ``UInt128`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt256`` | unsupported [3]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +.. [3] + + Clickhouse support numeric types up to 256 bit - ``Int256``, ``UInt256``, ``Decimal256(S)``, ``Decimal(P=39..76, S=0..76)``. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. Temporal types --------------- - -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+============================================+==================================+============================================+ -| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``Date32`` | unsupported | | | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()``, milliseconds | ``DateTime64(6)``, milliseconds, | ``DateTime``, seconds, **precision loss** | -+--------------------------------------+ + + + -| ``DateTime64(3)``, milliseconds | | | | -+--------------------------------------+ + + + -| ``DateTime64(6)``, microseconds | | | | -+--------------------------------------+--------------------------------------------+----------------------------------+ + -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, milliseconds, | ``DateTime64(6)``, milliseconds, | | -| | **precision loss** | **precision loss** | + -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``IntervalNanosecond`` | unsupported | | | -+--------------------------------------+ + + + -| ``IntervalMicrosecond`` | | | | -+--------------------------------------+ + + + -| ``IntervalMillisecond`` | | | | 
-+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------------+ + + + -| ``IntervalMinute`` | | | | -+--------------------------------------+ + + + -| ``IntervalHour`` | | | | -+--------------------------------------+ + + + -| ``IntervalDay`` | | | | -+--------------------------------------+ + + + -| ``IntervalMonth`` | | | | -+--------------------------------------+ + + + -| ``IntervalQuarter`` | | | | -+--------------------------------------+ + + + -| ``IntervalWeek`` | | | | -+--------------------------------------+ + + + -| ``IntervalYear`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +~~~~~~~~~~~~~~ + +Note: ``DateTime(P, TZ)`` has the same precision as ``DateTime(P)``. + ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++===================================+======================================+==================================+===============================+ +| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``Date32`` | unsupported | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| | | | **precision loss** [5]_ | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [6]_ | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | +| | **precision loss** [4]_ | **precision loss** [4]_ | | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+ | +| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | unsupported | | | ++-----------------------------------+ | | | +| ``IntervalMicrosecond`` | | | | ++-----------------------------------+ | | | +| ``IntervalMillisecond`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalSecond`` | 
``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalMinute`` | | | | ++-----------------------------------+ | | | +| ``IntervalHour`` | | | | ++-----------------------------------+ | | | +| ``IntervalDay`` | | | | ++-----------------------------------+ | | | +| ``IntervalMonth`` | | | | ++-----------------------------------+ | | | +| ``IntervalQuarter`` | | | | ++-----------------------------------+ | | | +| ``IntervalWeek`` | | | | ++-----------------------------------+ | | | +| ``IntervalYear`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ + +.. [4] + Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + +.. [5] + Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). + Inserting data with milliseconds precision (``23:59:59.999``) will lead to throwing away milliseconds (``23:59:59``). + +.. [6] + Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. + So you can create Clickhouse table with Spark, but cannot write data to column of this type. String types ------------- - -+--------------------------------------+------------------+------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+==================+========================+============================================+ -| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | -+--------------------------------------+ + + + -| ``IPv6`` | | | | -+--------------------------------------+ + + + -| ``Enum8`` | | | | -+--------------------------------------+ + + + -| ``Enum16`` | | | | -+--------------------------------------+ + + + -| ``FixedString(N)`` | | | | -+--------------------------------------+ + + + -| ``String`` | | | | -+--------------------------------------+------------------+------------------------+--------------------------------------------+ +~~~~~~~~~~~~~ + ++--------------------------------------+------------------+------------------------+--------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++======================================+==================+========================+==========================+ +| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | ++--------------------------------------+ | | | +| ``IPv6`` | | | | ++--------------------------------------+ | | | +| ``Enum8`` | | | | ++--------------------------------------+ | | | +| ``Enum16`` | | | | ++--------------------------------------+ | | | +| ``FixedString(N)`` | | | | ++--------------------------------------+ | | | +| ``String`` | | | | ++--------------------------------------+------------------+ | | +| ``-`` | ``BinaryType()`` | | | ++--------------------------------------+------------------+------------------------+--------------------------+ Unsupported types ----------------- -Columns of these types cannot be read/written by Spark: +Columns of these Clickhouse types cannot 
be read by Spark:
 
 * ``AggregateFunction(func, T)``
 * ``Array(T)``
 * ``JSON``
@@ -141,38 +284,49 @@ Columns of these types cannot be read/written by Spark:
 * ``Tuple(T1, T2, ...)``
 * ``UUID``
 
-This is because there is no dedicated Clickhouse dialect for Spark, so some types cannot be properly converted to Spark types (e.g. ``Array``),
-even if Spark does support such type (e.g. ``ArrayType()``).
+Dataframes with these Spark types cannot be written to Clickhouse:
+    * ``ArrayType(T)``
+    * ``BinaryType()``
+    * ``CharType(N)``
+    * ``DayTimeIntervalType(P, S)``
+    * ``MapType(K, V)``
+    * ``NullType()``
+    * ``StructType([...])``
+    * ``TimestampNTZType()``
+    * ``VarcharType(N)``
+
+This is because Spark does not have a dedicated Clickhouse dialect, and uses the Generic JDBC dialect instead.
+This dialect does not have type conversion between some types, like Clickhouse ``Array`` -> Spark ``ArrayType()``, and vice versa.
 
-The is a way to avoid this - just cast unsupported type to ``String``.
+There is a way to avoid this - just cast everything to ``String``.
 
 Read unsupported column type
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format:
-
-.. code:: sql
-
-    SELECT CAST(array_column AS String) FROM ...
-
-    -- or
-
-    SELECT toJSONString(array_column) FROM ...
-
-And then cast string column in resulting dataframe to proper type using `from_json `_:
+Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format,
+and then cast string column in resulting dataframe to proper type using `from_json `_:
 
 .. code:: python
 
     from pyspark.sql.functions import from_json
     from pyspark.sql.types import ArrayType, IntegerType
 
-    df = clickhouse.sql(...)
+    reader = DBReader(
+        connection=clickhouse,
+        columns=[
+            "id",
+            "toJSONString(array_column) array_column",
+        ],
+    )
+    df = reader.run()
 
     # Spark requires all columns to have some specific type, describe it
    column_type = ArrayType(IntegerType())
 
-    parsed_array = from_json(df.array_column, schema).alias("array_column")
-    df = df.select(parsed_array)
+    df = df.select(
+        df.id,
+        from_json(df.array_column, column_type).alias("array_column"),
+    )
 
 Write unsupported column type
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -180,21 +334,25 @@ Write unsupported column type
 Convert dataframe column to JSON using `to_json `_,
 and write it as ``String`` column in Clickhouse:
 
-.. code:: sql
+.. code:: python
 
-    CREATE TABLE target_tbl AS (
-        id Int32,
-        array_column_json String,
+    clickhouse.execute(
+        """
+        CREATE TABLE target_tbl AS (
+            id Int32,
+            array_column_json String,
+        )
+        ENGINE = MergeTree()
+        ORDER BY id
+        """,
     )
-    ENGINE = MergeTree()
-    ORDER BY time
-
-.. code:: python
 
     from pyspark.sql.functions import to_json
 
-    array_column_json = to_json(df.array_column)
-    df = df.select(df.id, array_column_json.alias("array_column_json"))
+    df = df.select(
+        df.id,
+        to_json(df.array_column).alias("array_column_json"),
+    )
 
     writer.run(df)
 
@@ -204,54 +362,16 @@ Then you can parse this column on Clickhouse side:
 
     SELECT id, JSONExtract(json_column, 'Array(String)') FROM target_tbl
 
-You can also use `MATERIALIZED `_
-and `ALIAS `_ columns
+You can also use `ALIAS `_ columns
 to avoid writing such expression in every ``SELECT`` clause all the time.
 
 Downsides:
 
 * Using ``SELECT JSONExtract(...)`` or ``ALIAS`` column can be expensive, because value is calculated on every row access.
   This can be especially harmful if such column is used in ``WHERE`` clause.
-* Using ``MATERIALIZED`` column allows to perform such expensive calculation just once, but this requires up to 2x storage, because Clickhouse stores both raw and parsed column. -* Both ``ALIAS`` and ``MATERIALIZED`` columns are not included in ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. +* Both ``ALIAS`` columns are not included in ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. .. warning:: + `MATERIALIZED `_ and `EPHEMERAL `_ columns are not supported by Spark because they cannot be selected to determine target column type. - -Creating tables using Spark ---------------------------- - -.. warning:: - - Absolutely not recommended! - -If Spark dataframe is written to Clickhouse table which does not exists yet, it will be automatically created. - -But Spark will use types from Generic JDBC dialect, and generic types like ``TIMESTAMP`` have different precision -than Clickhouse-specific ``DateTime32`` / ``DateTime64``. - -Always prefer creating tables with specific types **BEFORE WRITING DATA** using Spark: - -.. code:: python - - clickhouse.execute( - """ - CREATE TABLE target_tbl AS ( - id UInt8, - value DateTime64(6) -- specific type and precision - ) - ENGINE = MergeTree() - ORDER BY value - """, - ) - -References ----------- - -Here you can find the source code used by Clickhouse JDBC and Spark for performing type conversion: - -* `Clickhouse -> JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ -* `JDBC -> Clickhouse `_ diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 0bc305930..09f5749f2 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -3,6 +3,8 @@ Writing to Clickhouse using ``DBWriter`` ======================================== +For writing data to Clickhouse, use :obj:`DBWriter `. + .. warning:: Please take into account :ref:`clickhouse-types` @@ -15,8 +17,6 @@ Writing to Clickhouse using ``DBWriter`` This is because Spark's DDL generator can create columns with different precision and types than it is expected, causing precision loss or other issues. -For writing data to Clickhouse, use :obj:`DBWriter `. 
- Examples -------- @@ -33,7 +33,8 @@ Examples connection=clickhouse, target="schema.table", options=Clickhouse.WriteOptions( - if_exists="replace_entire_table", + if_exists="append", + # ENGINE is required by Clickhouse createTableOptions="ENGINE = MergeTree() ORDER BY id", ), ) From 5aa67dcdb48fd58ada481b5d7a2f76932a57c4af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 4 Mar 2024 11:55:09 +0000 Subject: [PATCH 31/63] [DOP-13253] Add Postgres types & prerequisites documentation --- docs/changelog/next_release/229.feature.rst | 4 + .../next_release/229.improvement.rst | 4 + .../db_connection/clickhouse/read.rst | 4 +- .../db_connection/clickhouse/types.rst | 465 ++++++++++------- .../db_connection/clickhouse/write.rst | 7 +- .../db_connection/postgres/execute.rst | 95 +++- .../db_connection/postgres/index.rst | 8 + .../db_connection/postgres/prerequisites.rst | 70 +++ .../db_connection/postgres/read.rst | 72 ++- .../connection/db_connection/postgres/sql.rst | 55 ++ .../db_connection/postgres/types.rst | 469 ++++++++++++++++++ .../db_connection/postgres/write.rst | 44 +- .../db_connection/greenplum/connection.py | 2 +- .../db_connection/postgres/connection.py | 27 +- .../test_jdbc_options_unit.py | 4 +- .../test_postgres_unit.py | 9 +- 16 files changed, 1120 insertions(+), 219 deletions(-) create mode 100644 docs/changelog/next_release/229.feature.rst create mode 100644 docs/changelog/next_release/229.improvement.rst create mode 100644 docs/connection/db_connection/postgres/prerequisites.rst create mode 100644 docs/connection/db_connection/postgres/sql.rst create mode 100644 docs/connection/db_connection/postgres/types.rst diff --git a/docs/changelog/next_release/229.feature.rst b/docs/changelog/next_release/229.feature.rst new file mode 100644 index 000000000..1175fe4d0 --- /dev/null +++ b/docs/changelog/next_release/229.feature.rst @@ -0,0 +1,4 @@ +Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. +This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. + +For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, without using intermediate columns or tables. diff --git a/docs/changelog/next_release/229.improvement.rst b/docs/changelog/next_release/229.improvement.rst new file mode 100644 index 000000000..30eae3549 --- /dev/null +++ b/docs/changelog/next_release/229.improvement.rst @@ -0,0 +1,4 @@ +Improve Postgres documentation: + * Add "Types" section describing mapping between Postgres and Spark types + * Add "Prerequisites" section describing different aspects of connecting to Postgres + * Separate documentation of DBReader and Postgres.sql diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index fcc165062..e421f340b 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -67,8 +67,8 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -References ----------- +Read options +------------ .. 
currentmodule:: onetl.connection.db_connection.jdbc_connection.options
diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst
index 34b785b3c..ec4ce55df 100644
--- a/docs/connection/db_connection/clickhouse/types.rst
+++ b/docs/connection/db_connection/clickhouse/types.rst
@@ -3,130 +3,269 @@
 Clickhouse <-> Spark type mapping
 =================================
 
+Type detection & casting
+------------------------
+
+Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using the column type.
+
+Reading from Clickhouse
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how the Clickhouse connector performs this:
+
+* For each column in the query result (``SELECT column1, column2, ... FROM table ...``) get the column name and Clickhouse type.
+* Find the corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise an exception.
+* Create a DataFrame from the query with specific column names and Spark types.
+
+Writing to some existing Clickhouse table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how the Clickhouse connector performs this:
+
+* Get names of columns in the DataFrame.
+* Perform a ``SELECT column1, column2, ... FROM table LIMIT 0`` query.
+* For each column in the query result get the column name and Clickhouse type.
+* **Find the corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise an exception. [1]_
+* Find the corresponding ``Spark type`` -> ``Clickhouse type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise an exception.
+* If ``Clickhouse type (write)`` matches ``Clickhouse type (read)``, no additional casts will be performed, and the DataFrame column will be written to Clickhouse as is.
+* If ``Clickhouse type (write)`` does not match ``Clickhouse type (read)``, the DataFrame column will be cast to the target column type **on the Clickhouse side**. For example, you can write a column with text data to an ``Int32`` column, if the column contains valid integer values within the supported value range and precision.
+
+.. [1]
+
+    Yes, this is weird.
+
+Create new table using Spark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how the Clickhouse connector performs this:
+
+* Find the corresponding ``Spark type`` -> ``Clickhouse type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise an exception.
+* Generate DDL for creating the table in Clickhouse, like ``CREATE TABLE (col1 ...)``, and run it.
+* Write the DataFrame to the created table as is.
+
+But Spark does not have a specific dialect for Clickhouse, so the Generic JDBC dialect is used.
+The Generic dialect uses SQL ANSI type names when creating tables in the target database, not database-specific types.
+
+In some cases this may lead to using the wrong column type. For example, Spark creates a column of type ``TIMESTAMP``
+which corresponds to Clickhouse's type ``DateTime32`` (precision up to seconds)
+instead of the more precise ``DateTime64`` (precision up to nanoseconds).
+This may lead to incidental precision loss, or sometimes data cannot be written to the created table at all.
+
+So instead of relying on Spark to create tables:
+
+..
code:: python + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. code:: python + + clickhouse.execute( + """ + CREATE TABLE default.target_tbl AS ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + writer = DBWriter( + connection=clickhouse, + table="default.target_tbl", + options=Clickhouse.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `Clickhouse -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> Clickhouse `_ + +Supported types +--------------- + Generic types -------------- +~~~~~~~~~~~~~ * ``LowCardinality(T)`` is same as ``T`` * ``Nullable(T)`` is same as ``T``, but Spark column is inferred as ``nullable=True`` Numeric types -------------- - -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+============================================+========================================+============================================+ -| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal`` | ``DecimalType(10,0)`` | ``Decimal(10,0)`` | ``Decimal(10,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P)`` | ``DecimalType(P,0)`` | ``Decimal(P,0)`` | ``Decimal(P,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P,S)``, 0 <= P <= S <= 38 | ``DecimalType(P,S)`` | ``Decimal(P,S)`` | ``Decimal(P,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal(P,S)``, P,S > 38 | ``DecimalType(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | ``Decimal(38,38)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal32(S)`` | ``DecimalType(9,S)`` | ``Decimal(9,S)`` | ``Decimal(9,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal64(S)`` | ``DecimalType(18,S)`` | ``Decimal(18,S)`` | ``Decimal(18,S)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal128(S)`` | ``DecimalType(38,S)`` | ``Decimal(38,S)`` | ``Decimal(38,S)`` | 
-+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Decimal256(S)`` | ``DecimalType(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | ``Decimal(38,S)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | -+--------------------------------------+ + + + -| ``Float64`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | -+--------------------------------------+ + + + -| ``Int16`` | | | | -+--------------------------------------+ + + + -| ``Int32`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int64`` | ``LongType()`` | ``Int64`` | ``BIGINT`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``Int256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``INTEGER`` | -+--------------------------------------+ + + + -| ``UInt16`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | -+--------------------------------------+ + + + -| ``UInt64`` | | | | -+--------------------------------------+ + + + -| ``UInt128`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ -| ``UInt256`` | ``DecimalType(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | ``Decimal(38,0)``, **precision loss** | -+--------------------------------------+--------------------------------------------+----------------------------------------+--------------------------------------------+ +~~~~~~~~~~~~~ + ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++================================+===================================+===============================+===============================+ +| ``Bool`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | 
++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``BooleanType()`` | ``UInt64`` | ``UInt64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal`` | ``DecimalType(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ``Decimal(P=10, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ``Decimal(P=0..38, S=0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ``Decimal(P=0..38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal(P=39..76, S=0..76)`` | unsupported [2]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal32(P=0..9)`` | ``DecimalType(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ``Decimal(P=9, S=0..9)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal64(S=0..18)`` | ``DecimalType(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ``Decimal(P=18, S=0..18)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal128(S=0..38)`` | ``DecimalType(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ``Decimal(P=38, S=0..38)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Decimal256(S=0..76)`` | unsupported [2]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Float32`` | ``DoubleType()`` | ``Float64`` | ``Float64`` | ++--------------------------------+ | | | +| ``Float64`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``FloatType()`` | ``Float32`` | ``Float32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``Int16`` | | | | ++--------------------------------+ | | | +| ``Int32`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ``Int64`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int128`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``Int256`` | unsupported [2]_ | | | 
++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ByteType()`` | ``Int8`` | ``Int8`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``-`` | ``ShortType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt8`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | ++--------------------------------+ | | | +| ``UInt16`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt32`` | ``DecimalType(20,0)`` | ``Decimal(20,0)`` | ``Decimal(20,0)`` | ++--------------------------------+ | | | +| ``UInt64`` | | | | ++--------------------------------+ | | | +| ``UInt128`` | | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``UInt256`` | unsupported [2]_ | | | ++--------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +.. [2] + + Clickhouse support numeric types up to 256 bit - ``Int256``, ``UInt256``, ``Decimal256(S)``, ``Decimal(P=39..76, S=0..76)``. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. Temporal types --------------- - -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+============================================+==================================+============================================+ -| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``Date32`` | unsupported | | | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``DateTime32``, seconds | ``TimestampType()``, milliseconds | ``DateTime64(6)``, milliseconds, | ``DateTime``, seconds, **precision loss** | -+--------------------------------------+ + + + -| ``DateTime64(3)``, milliseconds | | | | -+--------------------------------------+ + + + -| ``DateTime64(6)``, microseconds | | | | -+--------------------------------------+--------------------------------------------+----------------------------------+ + -| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, milliseconds, | ``DateTime64(6)``, milliseconds, | | -| | **precision loss** | **precision loss** | + -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``IntervalNanosecond`` | unsupported | | | -+--------------------------------------+ + + + -| ``IntervalMicrosecond`` | | | | -+--------------------------------------+ + + + -| ``IntervalMillisecond`` | | | | 
-+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ -| ``IntervalSecond`` | ``IntegerType()`` | ``Int32`` | ``Int32`` | -+--------------------------------------+ + + + -| ``IntervalMinute`` | | | | -+--------------------------------------+ + + + -| ``IntervalHour`` | | | | -+--------------------------------------+ + + + -| ``IntervalDay`` | | | | -+--------------------------------------+ + + + -| ``IntervalMonth`` | | | | -+--------------------------------------+ + + + -| ``IntervalQuarter`` | | | | -+--------------------------------------+ + + + -| ``IntervalWeek`` | | | | -+--------------------------------------+ + + + -| ``IntervalYear`` | | | | -+--------------------------------------+--------------------------------------------+----------------------------------+--------------------------------------------+ +~~~~~~~~~~~~~~ + +Note: ``DateTime(P, TZ)`` has the same precision as ``DateTime(P)``. + ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++===================================+======================================+==================================+===============================+ +| ``Date`` | ``DateType()`` | ``Date`` | ``Date`` | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``Date32`` | unsupported | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime32``, seconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(3)``, milliseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | +| | | | **precision loss** [4]_ | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``DateTime64(6)``, microseconds | ``TimestampType()``, microseconds | ``DateTime64(6)``, microseconds | ``DateTime32``, seconds, | ++-----------------------------------+--------------------------------------+----------------------------------+ **cannot be inserted** [5]_ | +| ``DateTime64(7..9)``, nanoseconds | ``TimestampType()``, microseconds, | ``DateTime64(6)``, microseconds, | | +| | **precision loss** [3]_ | **precision loss** [3]_ | | +| | | | | ++-----------------------------------+--------------------------------------+----------------------------------+ | +| ``-`` | ``TimestampNTZType()``, microseconds | ``DateTime64(6)`` | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalNanosecond`` | unsupported | | | ++-----------------------------------+ | | | +| ``IntervalMicrosecond`` | | | | ++-----------------------------------+ | | | +| ``IntervalMillisecond`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +| ``IntervalSecond`` | 
``IntegerType()`` | ``Int32`` | ``Int32`` | ++-----------------------------------+ | | | +| ``IntervalMinute`` | | | | ++-----------------------------------+ | | | +| ``IntervalHour`` | | | | ++-----------------------------------+ | | | +| ``IntervalDay`` | | | | ++-----------------------------------+ | | | +| ``IntervalMonth`` | | | | ++-----------------------------------+ | | | +| ``IntervalQuarter`` | | | | ++-----------------------------------+ | | | +| ``IntervalWeek`` | | | | ++-----------------------------------+ | | | +| ``IntervalYear`` | | | | ++-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ + +.. [3] + Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. + +.. [4] + Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). + Inserting data with milliseconds precision (``23:59:59.999``) will lead to throwing away milliseconds (``23:59:59``). + +.. [5] + Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. + So you can create Clickhouse table with Spark, but cannot write data to column of this type. String types ------------- - -+--------------------------------------+------------------+------------------------+--------------------------------------------+ -| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create table using Spark) | -+======================================+==================+========================+============================================+ -| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | -+--------------------------------------+ + + + -| ``IPv6`` | | | | -+--------------------------------------+ + + + -| ``Enum8`` | | | | -+--------------------------------------+ + + + -| ``Enum16`` | | | | -+--------------------------------------+ + + + -| ``FixedString(N)`` | | | | -+--------------------------------------+ + + + -| ``String`` | | | | -+--------------------------------------+------------------+------------------------+--------------------------------------------+ +~~~~~~~~~~~~~ + ++--------------------------------------+------------------+------------------------+--------------------------+ +| Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | ++======================================+==================+========================+==========================+ +| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | ++--------------------------------------+ | | | +| ``IPv6`` | | | | ++--------------------------------------+ | | | +| ``Enum8`` | | | | ++--------------------------------------+ | | | +| ``Enum16`` | | | | ++--------------------------------------+ | | | +| ``FixedString(N)`` | | | | ++--------------------------------------+ | | | +| ``String`` | | | | ++--------------------------------------+------------------+ | | +| ``-`` | ``BinaryType()`` | | | ++--------------------------------------+------------------+------------------------+--------------------------+ Unsupported types ----------------- -Columns of these types cannot be read/written by Spark: +Columns of these Clickhouse types cannot 
be read by Spark:
 
 * ``AggregateFunction(func, T)``
 * ``Array(T)``
 * ``JSON``
@@ -141,38 +280,49 @@ Columns of these types cannot be read/written by Spark:
 * ``Tuple(T1, T2, ...)``
 * ``UUID``
 
-This is because there is no dedicated Clickhouse dialect for Spark, so some types cannot be properly converted to Spark types (e.g. ``Array``),
-even if Spark does support such type (e.g. ``ArrayType()``).
+Dataframes with these Spark types cannot be written to Clickhouse:
+    * ``ArrayType(T)``
+    * ``BinaryType()``
+    * ``CharType(N)``
+    * ``DayTimeIntervalType(P, S)``
+    * ``MapType(K, V)``
+    * ``NullType()``
+    * ``StructType([...])``
+    * ``TimestampNTZType()``
+    * ``VarcharType(N)``
+
+This is because Spark does not have a dedicated Clickhouse dialect, and uses the Generic JDBC dialect instead.
+This dialect does not have type conversion between some types, like Clickhouse ``Array`` -> Spark ``ArrayType()``, and vice versa.
 
-The is a way to avoid this - just cast unsupported type to ``String``.
+There is a way to avoid this - just cast everything to ``String``.
 
 Read unsupported column type
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format:
-
-.. code:: sql
-
-    SELECT CAST(array_column AS String) FROM ...
-
-    -- or
-
-    SELECT toJSONString(array_column) FROM ...
-
-And then cast string column in resulting dataframe to proper type using `from_json `_:
+Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format,
+and then cast string column in resulting dataframe to proper type using `from_json `_:
 
 .. code:: python
 
     from pyspark.sql.functions import from_json
     from pyspark.sql.types import ArrayType, IntegerType
 
-    df = clickhouse.sql(...)
+    reader = DBReader(
+        connection=clickhouse,
+        columns=[
+            "id",
+            "toJSONString(array_column) array_column",
+        ],
+    )
+    df = reader.run()
 
     # Spark requires all columns to have some specific type, describe it
    column_type = ArrayType(IntegerType())
 
-    parsed_array = from_json(df.array_column, schema).alias("array_column")
-    df = df.select(parsed_array)
+    df = df.select(
+        df.id,
+        from_json(df.array_column, column_type).alias("array_column"),
+    )
 
 Write unsupported column type
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -180,21 +330,25 @@ Write unsupported column type
 Convert dataframe column to JSON using `to_json `_,
 and write it as ``String`` column in Clickhouse:
 
-.. code:: sql
+.. code:: python
 
-    CREATE TABLE target_tbl AS (
-        id Int32,
-        array_column_json String,
+    clickhouse.execute(
+        """
+        CREATE TABLE target_tbl AS (
+            id Int32,
+            array_column_json String,
+        )
+        ENGINE = MergeTree()
+        ORDER BY id
+        """,
    )
-    ENGINE = MergeTree()
-    ORDER BY time
-
-.. code:: python
 
     from pyspark.sql.functions import to_json
 
-    array_column_json = to_json(df.array_column)
-    df = df.select(df.id, array_column_json.alias("array_column_json"))
+    df = df.select(
+        df.id,
+        to_json(df.array_column).alias("array_column_json"),
+    )
 
     writer.run(df)
 
@@ -218,40 +372,3 @@ Downsides:
     `EPHEMERAL `_
     columns are not supported by Spark
     because they cannot be selected to determine target column type.
-
-Creating tables using Spark
----------------------------
-
-.. warning::
-
-    Absolutely not recommended!
-
-If Spark dataframe is written to Clickhouse table which does not exists yet, it will be automatically created.
-
-But Spark will use types from Generic JDBC dialect, and generic types like ``TIMESTAMP`` have different precision
-than Clickhouse-specific ``DateTime32`` / ``DateTime64``.
- -Always prefer creating tables with specific types **BEFORE WRITING DATA** using Spark: - -.. code:: python - - clickhouse.execute( - """ - CREATE TABLE target_tbl AS ( - id UInt8, - value DateTime64(6) -- specific type and precision - ) - ENGINE = MergeTree() - ORDER BY value - """, - ) - -References ----------- - -Here you can find the source code used by Clickhouse JDBC and Spark for performing type conversion: - -* `Clickhouse -> JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ -* `JDBC -> Clickhouse `_ diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 0bc305930..09f5749f2 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -3,6 +3,8 @@ Writing to Clickhouse using ``DBWriter`` ======================================== +For writing data to Clickhouse, use :obj:`DBWriter `. + .. warning:: Please take into account :ref:`clickhouse-types` @@ -15,8 +17,6 @@ Writing to Clickhouse using ``DBWriter`` This is because Spark's DDL generator can create columns with different precision and types than it is expected, causing precision loss or other issues. -For writing data to Clickhouse, use :obj:`DBWriter `. - Examples -------- @@ -33,7 +33,8 @@ Examples connection=clickhouse, target="schema.table", options=Clickhouse.WriteOptions( - if_exists="replace_entire_table", + if_exists="append", + # ENGINE is required by Clickhouse createTableOptions="ENGINE = MergeTree() ORDER BY id", ), ) diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index 042b97197..ea9bd5048 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -1,7 +1,100 @@ .. _postgres-execute: Executing statements in Postgres -================================ +================================== + +How to +------ + +There are 2 ways to execute some statement in Postgres + +Use :obj:`Postgres.fetch ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Postgres config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Postgres.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Postgres, like: + +* ``SELECT ... FROM ...`` +* ``WITH alias AS (...) SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Postgres + + postgres = Postgres(...) + + df = postgres.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Postgres.JDBCOptions(query_timeout=10), + ) + postgres.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`Postgres.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Postgres.close `. 
+
+Syntax support
+^^^^^^^^^^^^^^
+
+This method supports **any** query syntax supported by Postgres, like:
+
+* ``CREATE TABLE ...``, ``CREATE VIEW ...``
+* ``ALTER ...``
+* ``INSERT INTO ... AS SELECT ...``
+* ``DROP TABLE ...``, ``DROP VIEW ...``, and so on
+* ``CALL procedure(arg1, arg2) ...``
+* ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions
+* etc
+
+It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``.
+
+Examples
+^^^^^^^^
+
+.. code-block:: python
+
+    from onetl.connection import Postgres
+
+    postgres = Postgres(...)
+
+    with postgres:
+        postgres.execute("DROP TABLE schema.table")
+        postgres.execute(
+            """
+            CREATE TABLE schema.table (
+                id bigint GENERATED ALWAYS AS IDENTITY,
+                key text,
+                value real
+            )
+            """,
+            options=Postgres.JDBCOptions(query_timeout=10),
+        )
+
+
+References
+----------
 
 .. currentmodule:: onetl.connection.db_connection.postgres.connection
diff --git a/docs/connection/db_connection/postgres/index.rst b/docs/connection/db_connection/postgres/index.rst
index f5376ee93..74d199bbc 100644
--- a/docs/connection/db_connection/postgres/index.rst
+++ b/docs/connection/db_connection/postgres/index.rst
@@ -7,6 +7,7 @@ Postgres
     :maxdepth: 1
     :caption: Connection
 
+    prerequisites
     connection
 
 .. toctree::
@@ -14,5 +15,12 @@ Postgres
     :caption: Operations
 
     read
+    sql
     write
     execute
+
+.. toctree::
+    :maxdepth: 1
+    :caption: Troubleshooting
+
+    types
diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst
new file mode 100644
index 000000000..1921a3830
--- /dev/null
+++ b/docs/connection/db_connection/postgres/prerequisites.rst
@@ -0,0 +1,70 @@
+.. _postgres-prerequisites:
+
+Prerequisites
+=============
+
+Version Compatibility
+---------------------
+
+* PostgreSQL server versions: 8.2 or higher
+* Spark versions: 2.3.x - 3.5.x
+* Java versions: 8 - 20
+
+See `official documentation `_.
+
+Installing PySpark
+------------------
+
+To use the Postgres connector you should have PySpark installed (or injected to ``sys.path``)
+BEFORE creating the connector instance.
+
+See :ref:`install-spark` installation instruction for more details.
+
+Connecting to Postgres
+-----------------------
+
+Connection port
+~~~~~~~~~~~~~~~
+
+Connection is usually performed to port 5432. Port may differ for different Postgres instances.
+Please ask your Postgres administrator to provide the required information.
+
+Allowing connection to Postgres instance
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ask your Postgres administrator to allow your user (and probably IP) to connect to the instance,
+e.g. by updating ``pg_hba.conf`` file.
+
+See `official documentation `_.
+
+Postgres cluster interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you're using a Postgres cluster, it is currently possible to connect only to **one specific node**.
+Connecting to multiple nodes to perform load balancing, as well as automatic failover to a new master/replica, is not supported.
+
+Required grants
+~~~~~~~~~~~~~~~
+
+Ask your Postgres cluster administrator to set the following grants for a user,
+used for creating a connection:
+
+.. tabs::
+
+    .. code-tab:: sql Read + Write
+
+        -- allow creating tables in specific schema
+        GRANT USAGE, CREATE ON SCHEMA myschema TO username;
+
+        -- allow read & write access to specific table
+        GRANT SELECT, INSERT, TRUNCATE ON myschema.mytable TO username;
+
+    ..
code-tab:: sql Read only + + -- allow creating tables in specific schema + GRANT USAGE ON SCHEMA myschema TO username; + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index 737a731db..fc4a8e3b3 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -1,18 +1,74 @@ .. _postgres-read: -Reading from Postgres -===================== +Reading from Postgres using ``DBReader`` +========================================== -There are 2 ways of distributed data reading from Postgres: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Postgres.sql ` + Please take into account :ref:`postgres-types` -Both methods accept :obj:`JDBCReadOptions ` +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like JOINs. -.. currentmodule:: onetl.connection.db_connection.postgres.connection +Supported DBReader features +--------------------------- -.. automethod:: Postgres.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Postgres) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=Postgres.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + postgres = Postgres(...) + + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), + options=Postgres.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Read options +------------ .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst new file mode 100644 index 000000000..ae6d5602c --- /dev/null +++ b/docs/connection/db_connection/postgres/sql.rst @@ -0,0 +1,55 @@ +.. _postgres-sql: + +Reading from Postgres using ``Postgres.sql`` +================================================ + +.. warning:: + + Please take into account :ref:`postgres-types` + +:obj:`Postgres.sql ` allows passing custom SQL query, +but does not support incremental strategies. + +Method also accepts :obj:`JDBCReadOptions `. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ``SELECT ...`` +* ``WITH ... SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. 
+
+Examples
+--------
+
+.. code-block:: python
+
+    from onetl.connection import Postgres
+
+    postgres = Postgres(...)
+    df = postgres.sql(
+        """
+        SELECT
+            id,
+            key,
+            CAST(value AS text) value,
+            updated_at
+        FROM
+            some.mytable
+        WHERE
+            key = 'something'
+        """,
+        options=Postgres.ReadOptions(partition_column="id", num_partitions=10),
+    )
+
+References
+----------
+
+.. currentmodule:: onetl.connection.db_connection.postgres.connection
+
+.. automethod:: Postgres.sql
diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst
new file mode 100644
index 000000000..210e3bd6c
--- /dev/null
+++ b/docs/connection/db_connection/postgres/types.rst
@@ -0,0 +1,469 @@
+.. _postgres-types:
+
+Postgres <-> Spark type mapping
+=================================
+
+Type detection & casting
+------------------------
+
+Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+Reading from Postgres
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how Postgres connector performs this:
+
+* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Postgres type.
+* Find corresponding ``Postgres type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column [1]_. If no combination is found, raise exception.
+* Create DataFrame from query with specific column names and Spark types.
+
+.. [1]
+    All Postgres types that don't have a corresponding Java type are converted to ``String``.
+
+Writing to some existing Postgres table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how Postgres connector performs this:
+
+* Get names of columns in DataFrame. [2]_
+* Perform ``SELECT column1, column2, ... FROM table LIMIT 0`` query. For each column in query result get Postgres type.
+* Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* If ``Postgres type (write)`` matches ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is.
+* If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be cast to target column type **on Postgres side**.
+  For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [3]_.
+
+.. [2]
+    This allows writing data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column,
+    it will be populated by Postgres.
+
+.. [3]
+    This is true only if the source or target type of the ``T_df -> T_tbl`` conversion is ``text`` or ``StringType()``. So it is possible to
+    insert data back into a column of the original type, if the data format matches the column type.
+
+    But other types cannot be silently converted, like ``bytea -> bit(N)``. This requires explicit casting, see `Manual conversion to string`_.
+
+Create new table using Spark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how Postgres connector performs this:
+
+* Find corresponding ``Spark type`` -> ``Postgres type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* Generate DDL for creating table in Postgres, like ``CREATE TABLE (col1 ...)``, and run it.
+* Write DataFrame to created table as is.
+ +But Postgres connector support only limited number of types and almost no custom clauses (like ``PARTITION BY``, ``INDEX``, etc). +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=postgres, + table="public.table", + options=Postgres.WriteOptions( + if_exists="append", + createTableOptions="PARTITION BY RANGE (id)", + ), + ) + writer.run(df) + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. code:: python + + postgres.execute( + """ + CREATE TABLE public.table AS ( + id bigint, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (Id) + """, + ) + + writer = DBWriter( + connection=postgres, + table="public.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + writer.run(df) + +See Postgres `CREATE TABLE `_ documentation. + +Supported types +--------------- + +References +~~~~~~~~~~ + +See `List of Postgres types `_. + +Here you can find source code with type conversions: + +* `Postgres <-> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ + +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==================================+===================================+===============================+=========================+ +| ``decimal`` | ``DecimalType(P=38, S=18)`` | ``decimal(P=38, S=18)`` | ``decimal`` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=39.., S=0..)`` | unsupported [4]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=.., S=..-1)`` | unsupported [5]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``double precision`` | ``DoubleType()`` | ``double precision`` | ``double precision`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++----------------------------------+-----------------------------------+ | | +| ``-`` | ``ByteType()`` | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``integer`` | ``IntegerType()`` | ``integer`` | ``integer`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | 
++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``money`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------------------+ | | | +| ``int4range`` | | | | ++----------------------------------+ | | | +| ``int8range`` | | | | ++----------------------------------+ | | | +| ``numrange`` | | | | ++----------------------------------+ | | | +| ``int2vector`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ + +.. [4] + + Postgres support decimal types with unlimited precision. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +.. [5] + + Postgres support decimal types with negative scale, like ``decimal(38, -10)``. Spark doesn't. + +Temporal types +~~~~~~~~~~~~~~ + ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++====================================+==============================+=======================+=========================+ +| ``date`` | ``DateType()`` | ``date`` | ``date`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``time`` | ``TimestampType()``, | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+ with time format quirks [6]_ | | | +| ``time(0..6)`` | | | | ++------------------------------------+ | | | +| ``time with time zone`` | | | | ++------------------------------------+ | | | +| ``time(0..6) with time zone`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``timestamp`` | ``TimestampType()`` | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+ | | | +| ``timestamp(0..6)`` | | | | ++------------------------------------+ | | | +| ``timestamp with time zone`` | | | | ++------------------------------------+ | | | +| ``timestamp(0..6) with time zone`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``TimestampNTZType()`` | ``timestamp(6)`` | ``timestamp(6)`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``interval`` of any precision | ``StringType()`` [1]_ | ``text`` | ``text`` | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``DayTimeIntervalType()`` | unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``-`` | ``YearMonthIntervalType()`` | unsupported | unsupported | ++------------------------------------+------------------------------+-----------------------+-------------------------+ +| ``daterange`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++------------------------------------+ | | | +| ``tsrange`` | | | | ++------------------------------------+ | | | +| ``tstzrange`` | | | | ++------------------------------------+------------------------------+-----------------------+-------------------------+ + +.. 
[6] + + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from Postgres like ``23:59:59`` + it is actually read ``1970-01-01 23:59:59``, and vice versa. + +String types +~~~~~~~~~~~~ + ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++=============================+=======================+=======================+=========================+ +| ``character`` | ``StringType()`` | ``text`` | ``text`` | ++-----------------------------+ | | | +| ``character(N)`` | | | | ++-----------------------------+ | | | +| ``character varying`` | | | | ++-----------------------------+ | | | +| ``character varying(N)`` | | | | ++-----------------------------+ | | | +| ``text`` | | | | ++-----------------------------+ | | | +| ``json`` | | | | ++-----------------------------+ | | | +| ``jsonb`` | | | | ++-----------------------------+ | | | +| ``xml`` | | | | ++-----------------------------+-----------------------| | | +| ``CREATE TYPE ... AS ENUM`` | ``StringType()`` [1]_ | | | ++-----------------------------+ | | | +| ``tsvector`` | | | | ++-----------------------------+ | | | +| ``tsquery`` | | | | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``CharType()`` | ``unsupported`` | ``unsupported`` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``VarcharType()`` | ``unsupported`` | ``unsupported`` | ++-----------------------------+-----------------------+-----------------------+-------------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++==========================+=======================+=============================+=========================+ +| ``boolean`` | ``BooleanType()`` | ``boolean`` | ``boolean`` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit`` | ``BooleanType()`` | ``bool``, | ``bool`` | ++--------------------------+ | **cannot insert data** [3]_ | | +| ``bit(N=1)`` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit(N=2..)`` | ``ByteType()`` | ``bytea``, | ``bytea`` | +| | | **cannot insert data** [3]_ | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bit varying`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++--------------------------+ | | | +| ``bit varying(N)`` | | | | ++--------------------------+-----------------------+-----------------------------+-------------------------+ +| ``bytea`` | ``BinaryType()`` | ``bytea`` | ``bytea`` | ++--------------------------+-----------------------+-----------------------------+-------------------------+ + + +Struct types +~~~~~~~~~~~~ + ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++================================+=======================+=======================+=========================+ +| ``T[]`` | ``ArrayType(T)`` | ``T[]`` | ``T[]`` | 
++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``T[][]`` | unsupported | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``CREATE TYPE sometype (...)`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++--------------------------------+-----------------------+-----------------------+-------------------------+ +| ``-`` | ``StructType()`` | unsupported | | ++--------------------------------+-----------------------+ | | +| ``-`` | ``MapType()`` | | | ++--------------------------------+-----------------------+-----------------------+-------------------------+ + +Network types +~~~~~~~~~~~~~ + ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ +| ``cidr`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------+ | | | +| ``inet`` | | | | ++----------------------+ | | | +| ``macaddr`` | | | | ++----------------------+ | | | +| ``macaddr8`` | | | | ++----------------------+-----------------------+-----------------------+-------------------------+ + +Geo types +~~~~~~~~~ + ++----------------------+-----------------------+-----------------------+-------------------------+ +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | ++======================+=======================+=======================+=========================+ +| ``circle`` | ``StringType()`` [1]_ | ``text`` | ``text`` | ++----------------------+ | | | +| ``box`` | | | | ++----------------------+ | | | +| ``line`` | | | | ++----------------------+ | | | +| ``lseg`` | | | | ++----------------------+ | | | +| ``path`` | | | | ++----------------------+ | | | +| ``point`` | | | | ++----------------------+ | | | +| ``polygon`` | | | | ++----------------------+ | | | +| ``polygon`` | | | | ++----------------------+-----------------------+-----------------------+-------------------------+ + +Explicit type cast +------------------- + +``DBReader`` +~~~~~~~~~~~ + +It is possible to explicitly cast column of unsupported type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on Postgres side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use ``to_json`` Postgres function for convert column of any type to string representation, +and then parse this column on Spark side using `from_json `_: + +.. code-block:: python + + from pyspark.sql.functions import from_json + from pyspark.sql.types import IntegerType + + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) 
+
+    reader = DBReader(
+        connection=postgres,
+        columns=[
+            "id",
+            "supported_column",
+            "CAST(unsupported_column AS text) unsupported_column_str",
+            # or
+            "to_json(unsupported_column) unsupported_column_json",
+        ],
+    )
+    df = reader.run()
+
+    # Spark requires all columns to have some type, describe it
+    column_type = IntegerType()
+
+    # cast column content to proper Spark type
+    df = df.select(
+        df.id,
+        df.supported_column,
+        # explicit cast
+        df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+        # or explicit json parsing
+        from_json(df.unsupported_column_json, column_type).alias("parsed_json"),
+    )
+
+``DBWriter``
+~~~~~~~~~~~~
+
+It is always possible to convert data on Spark side to string, and then write it to a ``text`` column in a Postgres table.
+
+Using ``to_json``
+^^^^^^^^^^^^^^^^^
+
+For example, you can convert data using `to_json `_ function.
+
+.. code:: python
+
+    from pyspark.sql.functions import to_json
+
+    from onetl.connection import Postgres
+    from onetl.db import DBWriter
+
+    postgres = Postgres(...)
+
+    postgres.execute(
+        """
+        CREATE TABLE schema.target_table (
+            id int,
+            supported_column timestamp,
+            unsupported_column_json jsonb -- any column type, actually
+        )
+        """,
+    )
+
+    write_df = df.select(
+        df.id,
+        df.supported_column,
+        to_json(df.unsupported_column).alias("unsupported_column_json"),
+    )
+
+    writer = DBWriter(
+        connection=postgres,
+        target="schema.target_table",
+    )
+    writer.run(write_df)
+
+Then you can parse this column on Postgres side (for example, by creating a view):
+
+.. code-block:: sql
+
+    SELECT
+        id,
+        supported_column,
+        -- access some nested field
+        unsupported_column_json->'field'
+    FROM
+        schema.target_table
+
+To avoid casting the value on every table read you can use a `GENERATED ALWAYS STORED `_ column, but this requires 2x space (for the original and the parsed value).
+
+Manual conversion to string
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Postgres connector also supports converting a text value directly to the target column type, if the value has a proper format.
+
+For example, you can write data like ``[123, 345)`` to an ``int8range`` column, because Postgres allows the cast ``'[123, 345)'::int8range``:
+
+.. code:: python
+
+    from pyspark.sql.types import StringType
+    from pyspark.sql.functions import udf
+
+    from onetl.connection import Postgres
+    from onetl.db import DBWriter
+
+    postgres = Postgres(...)
+
+    postgres.execute(
+        """
+        CREATE TABLE schema.target_table (
+            id int,
+            range_column int8range -- any column type, actually
+        )
+        """,
+    )
+
+
+    @udf(returnType=StringType())
+    def array_to_range(value: tuple):
+        """This UDF allows to convert tuple[start, end] to Postgres' range format"""
+        start, end = value
+        return f"[{start},{end})"
+
+
+    write_df = df.select(
+        df.id,
+        array_to_range(df.range_column).alias("range_column"),
+    )
+
+    writer = DBWriter(
+        connection=postgres,
+        target="schema.target_table",
+    )
+    writer.run(write_df)
+
+This can be tricky to implement and may slow down the write process.
+But it does not require extra space on Postgres side, and allows avoiding an explicit value cast on every table read.
diff --git a/docs/connection/db_connection/postgres/write.rst b/docs/connection/db_connection/postgres/write.rst
index db96b4ec8..1f5f2ed5c 100644
--- a/docs/connection/db_connection/postgres/write.rst
+++ b/docs/connection/db_connection/postgres/write.rst
@@ -1,9 +1,47 @@
 ..
_postgres-write: -Writing to Postgres -=================== +Writing to Postgres using ``DBWriter`` +======================================== -For writing data to Postgres, use :obj:`DBWriter ` with options below. +For writing data to Postgres, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`postgres-types` + +.. warning:: + + It is always recommended to create table explicitly using :obj:`Postgres.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Postgres + from onetl.db import DBWriter + + postgres = Postgres(...) + + df = ... # data is here + + writer = DBWriter( + connection=postgres, + target="schema.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index a525713f9..b508ea3ba 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -98,7 +98,7 @@ class Greenplum(JDBCMixin, DBConnection): For example: ``{"tcpKeepAlive": "true", "server.port": "50000-65535"}`` Supported options are: - * All `Postgres JDBC driver properties `_ + * All `Postgres JDBC driver properties `_ * Properties from `Greenplum connector for Spark documentation `_ page, but only starting with ``server.`` or ``pool.`` Examples diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 1353204fd..a4d27bcea 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -16,6 +16,9 @@ class PostgresExtra(GenericOptions): + # allows automatic conversion from text to target column type during write + stringtype: str = "unspecified" + class Config: extra = "allow" @@ -27,29 +30,9 @@ class Postgres(JDBCConnection): Based on Maven package ``org.postgresql:postgresql:42.6.0`` (`official Postgres JDBC driver `_). - .. dropdown:: Version compatibility - - * PostgreSQL server versions: 8.2 or higher - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Postgres connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. 
+ Before using this connector please take into account :ref:`postgres-prerequisites` Parameters ---------- @@ -78,7 +61,7 @@ class Postgres(JDBCConnection): For example: ``{"ssl": "false"}`` - See `Postgres JDBC driver properties documentation `_ + See `Postgres JDBC driver properties documentation `_ for more details Examples diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index f932408d0..ff33ef2af 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -228,7 +228,7 @@ def test_jdbc_read_options_to_jdbc(spark_mock): "CamelCaseOption": "left unchanged", }, "upperBound": "1000", - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc", + "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", } @@ -257,7 +257,7 @@ def test_jdbc_write_options_to_jdbc(spark_mock): "camelCaseOption": "left unchanged", "CamelCaseOption": "left unchanged", }, - "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc", + "url": "jdbc:postgresql://local:5432/default?ApplicationName=abc&stringtype=unspecified", } diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index 01f85eb08..f6b5f4e9b 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -55,7 +55,7 @@ def test_postgres(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&stringtype=unspecified" assert "password='passwd'" not in str(conn) assert "password='passwd'" not in repr(conn) @@ -71,7 +71,7 @@ def test_postgres_with_port(spark_mock): assert conn.password.get_secret_value() == "passwd" assert conn.database == "database" - assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc" + assert conn.jdbc_url == "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&stringtype=unspecified" def test_postgres_without_database_error(spark_mock): @@ -89,7 +89,10 @@ def test_postgres_with_extra(spark_mock): spark=spark_mock, ) - assert conn.jdbc_url == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true" + assert ( + conn.jdbc_url + == "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&autosave=always&ssl=true&stringtype=unspecified" + ) def test_postgres_without_mandatory_args(spark_mock): From 8db3ac25c27f28b6346a5124d511bc08c5e7de16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Feb 2024 16:25:57 +0000 Subject: [PATCH 32/63] [DOP-13251] Add Greenplum type mapping to documentation --- .../next_release/228.improvement.rst | 3 + .../db_connection/clickhouse/execute.rst | 2 +- .../db_connection/greenplum/execute.rst | 95 +++++ .../db_connection/greenplum/index.rst | 6 + .../db_connection/greenplum/prerequisites.rst | 4 +- .../db_connection/greenplum/read.rst | 296 ++++++++++++-- .../db_connection/greenplum/types.rst | 386 ++++++++++++++++++ 
.../db_connection/greenplum/write.rst | 56 ++- .../db_connection/greenplum/connection.py | 2 +- 9 files changed, 788 insertions(+), 62 deletions(-) create mode 100644 docs/changelog/next_release/228.improvement.rst create mode 100644 docs/connection/db_connection/greenplum/types.rst diff --git a/docs/changelog/next_release/228.improvement.rst b/docs/changelog/next_release/228.improvement.rst new file mode 100644 index 000000000..ef936eaaa --- /dev/null +++ b/docs/changelog/next_release/228.improvement.rst @@ -0,0 +1,3 @@ +Improve Greenplum documentation: + * Add "Types" section describing mapping between Greenplum and Spark types + * Add more examples of reading and writing data from Greenplum diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index f1ee68ff5..59d956a89 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -62,6 +62,7 @@ This method supports **any** query syntax supported by Clickhouse, like: * ``CREATE TABLE ...`` * ``ALTER ...`` * ``INSERT INTO ... AS SELECT ...`` +* etc It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``. @@ -89,7 +90,6 @@ Examples options=Clickhouse.JDBCOptions(query_timeout=10), ) - References ---------- diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index e2179a4ec..356df27b3 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -3,6 +3,101 @@ Executing statements in Greenplum ================================== +.. warning:: + + Unlike ``DBReader``, ``Greenplum.fetch`` and ``Greenplum.execute`` are implemented using Postgres JDBC connection, + and so types are handled a bit differently. See :ref:`postgres-types`. + +How to +------ + +There are 2 ways to execute some statement in Greenplum + +Use :obj:`Greenplum.fetch ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Greenplum config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Greenplum.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Greenplum, like: + +* ``SELECT ... FROM ...`` +* ``WITH alias AS (...) SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Greenplum + + greenplum = Greenplum(...) + + df = greenplum.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Greenplum.JDBCOptions(query_timeout=10), + ) + greenplum.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`Greenplum.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Greenplum.close `. 
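+
+A minimal sketch of both cleanup styles (assuming, as in the examples below, that the connection
+object can also be used as a context manager; the table name is hypothetical):
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+
+    greenplum = Greenplum(...)
+
+    # either close the connection explicitly ...
+    greenplum.execute("TRUNCATE TABLE schema.table")
+    greenplum.close()
+
+    # ... or use the context manager form shown in the examples below:
+    # with greenplum:
+    #     greenplum.execute("TRUNCATE TABLE schema.table")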
+
+Syntax support
+^^^^^^^^^^^^^^
+
+This method supports **any** query syntax supported by Greenplum, like:
+
+* ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on
+* ``ALTER ...``
+* ``INSERT INTO ... AS SELECT ...``
+* ``DROP TABLE ...``, ``DROP VIEW ...``, and so on
+* ``CALL procedure(arg1, arg2) ...``
+* ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions
+* etc
+
+It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``.
+
+Examples
+^^^^^^^^
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+
+    greenplum = Greenplum(...)
+
+    with greenplum:
+        greenplum.execute("DROP TABLE schema.table")
+        greenplum.execute(
+            """
+            CREATE TABLE schema.table (
+                id int,
+                key text,
+                value real
+            )
+            DISTRIBUTED BY (id)
+            """,
+            options=Greenplum.JDBCOptions(query_timeout=10),
+        )
+
 Interaction schema
 ------------------
diff --git a/docs/connection/db_connection/greenplum/index.rst b/docs/connection/db_connection/greenplum/index.rst
index b4ff40331..af445579f 100644
--- a/docs/connection/db_connection/greenplum/index.rst
+++ b/docs/connection/db_connection/greenplum/index.rst
@@ -17,3 +17,9 @@ Greenplum
     read
     write
     execute
+
+.. toctree::
+    :maxdepth: 1
+    :caption: Troubleshooting
+
+    types
diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst
index 4f33b51a0..4796e9d12 100644
--- a/docs/connection/db_connection/greenplum/prerequisites.rst
+++ b/docs/connection/db_connection/greenplum/prerequisites.rst
@@ -24,7 +24,7 @@ Downloading VMware package
 --------------------------
 
 To use Greenplum connector you should download connector ``.jar`` file from
-`VMware website `_
+`VMware website `_
 and then pass it to Spark session.
 
 .. warning::
@@ -197,7 +197,7 @@ used for creating a connection:
 
 .. tabs::
 
-    .. code-tab:: sql Read + write
+    .. code-tab:: sql Read + Write
 
         -- get access to get tables metadata & cluster information
         GRANT SELECT ON information_schema.tables TO username;
diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst
index feecaf0bb..9f7d925cc 100644
--- a/docs/connection/db_connection/greenplum/read.rst
+++ b/docs/connection/db_connection/greenplum/read.rst
@@ -1,15 +1,109 @@
 .. _greenplum-read:
 
-Reading from Greenplum
-=======================
+Reading from Greenplum using ``DBReader``
+==========================================
 
-For reading data from Greenplum, use :obj:`DBReader ` with options below.
+.. warning::
+
+    Please take into account :ref:`greenplum-types`.
+
+Data can be read from Greenplum to Spark using :obj:`DBReader `.
+It also supports :ref:`strategy` for incremental data reading.
 
 .. note::
 
     Unlike JDBC connectors, *Greenplum connector for Spark* does not support
-    executing **custom** SQL queries using ``.sql`` method. Connector can be used to only read some table data
-    (with filters, if needed) using DBReader.
+    executing **custom** SQL queries using ``.sql`` method. The connector can only be used to read data from a table or view.
+ +Supported DBReader features +--------------------------- + +* ✅︎ ``columns`` (see note below) +* ✅︎ ``where`` (see note below) +* ✅︎ ``hwm`` (see note below), supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Greenplum) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`GreenplumReadOptions `) + +.. warning:: + + In case of Greenplum connector, ``DBReader`` does not generate raw ``SELECT`` query. Instead it relies on Spark SQL syntax + which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL. + + So ``columns``, ``where`` and ``hwm.expression`` should be specified in Spark SQL syntax, not Greenplum SQL. + + This is OK: + + .. code-block:: python + + DBReader( + columns=[ + "some_column", + # this cast is executed on Spark side + "CAST(another_column AS STRING)", + ], + # this predicate is parsed by Spark, and can be pushed down to Greenplum + where="some_column LIKE 'val1%'", + ) + + This is will fail: + + .. code-block:: python + + DBReader( + columns=[ + "some_column", + # Spark does not have `text` type + "CAST(another_column AS text)", + ], + # Spark does not support ~ syntax for regexp matching + where="some_column ~ 'val1.*'", + ) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + reader = DBReader( + connection=greenplum, + source="schema.table", + columns=["id", "key", "CAST(value AS string) value", "updated_dt"], + where="key = 'something'", + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + greenplum = Greenplum(...) + + reader = DBReader( + connection=greenplum, + source="schema.table", + columns=["id", "key", "CAST(value AS string) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="greenplum_hwm", expression="updated_dt"), + ) + + with IncrementalStrategy(): + df = reader.run() Interaction schema ------------------ @@ -91,60 +185,176 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d Recommendations --------------- -Reading from views -~~~~~~~~~~~~~~~~~~ +``DBReader`` in case of Greenplum connector requires view or table to have some column which is used by Spark +for parallel reads. + +Choosing proper column allows each Spark executor to read only part of data stored in the specified segment, +avoiding moving large amounts of data between segments, which improves reading performance. + +Parallel read using ``gp_segment_id`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By default, ``DBReader`` will use `gp_segment_id `_ +column for parallel data reading. Each DataFrame partition will contain data of a specific Greenplum segment. -This connector is **NOT** designed to read data from views. +This allows each Spark executor read only data from specific Greenplum segment, avoiding moving large amounts of data between segments. -You can technically read data from a view which has -`gp_segment_id `_ column. -But this is **not** recommended because each Spark executor will run the same query, which may lead to running duplicated calculations -and sending data between segments only to skip most of the result and select only small part. 
+If view is used, it is recommended to include ``gp_segment_id`` column to this view: -Prefer following option: - * Create staging table to store result data, using :obj:`Greenplum.execute ` - * Use the same ``.execute`` method run a query ``INSERT INTO staging_table AS SELECT FROM some_view``. This will be done on Greenplum segments side, query will be run only once. - * Read data from staging table to Spark executor using :obj:`DBReader `. - * Drop staging table using ``.execute`` method. +.. dropdown:: Reading from view with gp_segment_id column -Using ``JOIN`` on Greenplum side -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. code-block:: python -If you need to get data of joining 2 tables in Greenplum, you should: - * Create staging table to store result data, using ``Greenplum.execute`` - * Use the same ``Greenplum.execute`` run a query ``INSERT INTO staging_table AS SELECT FROM table1 JOIN table2``. This will be done on Greenplum segments side, in a distributed way. - * Read data from staging table to Spark executor using ``DBReader``. - * Drop staging table using ``Greenplum.execute``. + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE VIEW schema.view_with_gp_segment_id AS + SELECT + id, + some_column, + another_column, + gp_segment_id -- IMPORTANT + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_gp_segment_id", + ) + df = reader.run() + +Parallel read using custom ``partition_column`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Sometimes table or view is lack of ``gp_segment_id`` column, but there is some column +with value range correlated with Greenplum segment distribution. + +In this case, custom column can be used instead: + +.. dropdown:: Reading from view with custom partition_column + + .. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE VIEW schema.view_with_partition_column AS + SELECT + id, + some_column, + part_column -- correlated to greenplum segment ID + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_partition_column", + options=Greenplum.Options( + # parallelize data using specified column + partition_column="part_column", + # create 10 Spark tasks, each will read only part of table data + num_partitions=10, + ), + ) + df = reader.run() + +Reading ``DISTRIBUTED REPLICATED`` tables +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Replicated tables do not have ``gp_segment_id`` column at all, so you need to set ``partition_column`` to some column name +of type integer/bigint/smallint. + +Parallel ``JOIN`` execution +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In case of using views which require some data motion between Greenplum segments, like ``JOIN`` queries, another approach should be used. + +Each Spark executor N will run the same query, so each of N query will start its own JOIN process, leading to really heavy load on Greenplum segments. +**This should be avoided**. + +Instead is recommended to run ``JOIN`` query on Greenplum side, save the result to an intermediate table, +and then read this table using ``DBReader``: + +.. dropdown:: Reading from view using intermediate table + + .. code-block:: python + + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) 
+ + greenplum.execute( + """ + CREATE UNLOGGED TABLE schema.intermediate_table AS + SELECT + id, + tbl1.col1, + tbl1.data, + tbl2.another_data + FROM + schema.table1 as tbl1 + JOIN + schema.table2 as tbl2 + ON + tbl1.col1 = tbl2.col2 + WHERE ... + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.intermediate_table", + ) + df = reader.run() + + # write dataframe somethere + + greenplum.execute( + """ + DROP TABLE schema.intermediate_table + """, + ) .. warning:: - Do **NOT** try to read data from ``table1`` and ``table2`` using ``DBReader``, and then join the resulting dataframes! + **NEVER** do that: - This will lead to sending all the data from both tables to Spark executor memory, and then ``JOIN`` - will be performed on Spark side, not Greenplum. This is **very** inefficient. + .. code-block:: python -Using ``TEMPORARY`` tables -~~~~~~~~~~~~~~~~~~~~~~~~~~ + df1 = DBReader(connection=greenplum, table="public.table1", ...).run() + df2 = DBReader(connection=greenplum, table="public.table2", ...).run() -Someone could think that writing data from ``VIEW`` or result of ``JOIN`` to ``TEMPORARY`` table, -and then passing it to DBReader, is an efficient way to read data from Greenplum, because temp tables are not generating WAL files, -and are automatically deleted after finishing the transaction. + joined_df = df1.join(df2, on="col") -That's will **not** work. Each Spark executor establishes its own connection to Greenplum, -and thus reads its own temporary table, which does not contain any data. + This will lead to sending all the data from both ``table1`` and ``table2`` to Spark executor memory, and then ``JOIN`` + will be performed on Spark side, not inside Greenplum. This is **VERY** inefficient. -You should use `UNLOGGED `_ tables -to write data to staging table without generating useless WAL logs. +``TEMPORARY`` tables notice +^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Mapping of Greenplum types to Spark types -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Someone could think that writing data from view or result of ``JOIN`` to ``TEMPORARY`` table, +and then passing it to ``DBReader``, is an efficient way to read data from Greenplum. This is because temp tables are not generating WAL files, +and are automatically deleted after finishing the transaction. + +That will **NOT** work. Each Spark executor establishes its own connection to Greenplum. +And each connection starts its own transaction which means that every executor will read empty temporary table. -See `official documentation `_ -for more details. -onETL does not perform any additional casting of types while reading data. +You should use `UNLOGGED `_ tables +to write data to intermediate table without generating WAL logs. -Options -------- +Read options +------------ .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst new file mode 100644 index 000000000..d6404569e --- /dev/null +++ b/docs/connection/db_connection/greenplum/types.rst @@ -0,0 +1,386 @@ +.. _greenplum-types: + +Greenplum <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. 
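+
+A quick way to check which Spark types were inferred is to print the DataFrame schema.
+A small illustration using plain PySpark (assuming ``reader`` is a ``DBReader`` as shown below; the column names are hypothetical):
+
+.. code-block:: python
+
+    df = reader.run()
+
+    # print column names and Spark types inferred by the connector
+    df.printSchema()
+    # root
+    #  |-- id: integer (nullable = true)
+    #  |-- business_dt: timestamp (nullable = true)
+    #  |-- value: decimal(38,18) (nullable = true)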
+
+Reading from Greenplum
+~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how Greenplum connector performs this:
+
+* Execute query ``SELECT * FROM table LIMIT 0`` [1]_.
+* For each column in query result get column name and Greenplum type.
+* Find corresponding ``Greenplum type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* Use Spark column projection and predicate pushdown features to build a final query.
+* Create DataFrame from generated query with inferred schema.
+
+.. [1]
+    Yes, **all columns of a table**, not just selected ones.
+    This means that if source table **contains** columns with unsupported type, the entire table cannot be read.
+
+Writing to some existing Greenplum table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is how Greenplum connector performs this:
+
+* Get names of columns in DataFrame.
+* Perform ``SELECT * FROM table LIMIT 0`` query.
+* For each column in query result get column name and Greenplum type.
+* Match table columns with DataFrame columns (by name, case insensitive).
+  If some column is present only in target table, but not in DataFrame (like ``DEFAULT`` or ``SERIAL`` column), and vice versa, raise an exception.
+  See `Write unsupported column type`_.
+* Find corresponding ``Spark type`` -> ``Greenplum type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* If ``Greenplum type (write)`` matches ``Greenplum type (read)``, no additional casts will be performed, DataFrame column will be written to Greenplum as is.
+* If ``Greenplum type (write)`` does not match ``Greenplum type (read)``, DataFrame column will be cast to target column type **on Greenplum side**. For example, you can write column with text data to a ``json`` column, which Greenplum connector currently does not support.
+
+Create new table using Spark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how Greenplum connector performs this:
+
+* Find corresponding ``Spark type`` -> ``Greenplum type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* Generate DDL for creating table in Greenplum, like ``CREATE TABLE (col1 ...)``, and run it.
+* Write DataFrame to created table as is.
+
+More details `can be found here `_.
+
+But Greenplum connector supports only a limited number of types and almost no custom clauses (like ``PARTITION BY``).
+So instead of relying on Spark to create tables:
+
+.. dropdown:: See example
+
+    .. code:: python
+
+        writer = DBWriter(
+            connection=greenplum,
+            table="public.table",
+            options=Greenplum.WriteOptions(
+                if_exists="append",
+                # by default distribution is random
+                distributedBy="id",
+                # partitionBy is not supported
+            ),
+        )
+        writer.run(df)
+
+Always prefer creating a table with the desired DDL **BEFORE WRITING DATA**:
+
+.. dropdown:: See example
+
+    .. code:: python
+
+        greenplum.execute(
+            """
+            CREATE TABLE public.table (
+                id bigint,
+                business_dt timestamp(6),
+                value json
+            )
+            PARTITION BY RANGE (business_dt)
+            DISTRIBUTED BY (id)
+            """,
+        )
+
+        writer = DBWriter(
+            connection=greenplum,
+            table="public.table",
+            options=Greenplum.WriteOptions(if_exists="append"),
+        )
+        writer.run(df)
+
+See Greenplum `CREATE TABLE `_ documentation.
+
+Supported types
+---------------
+
+See `list of Greenplum types `_.
+ +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | ++==================================+===================================+===============================+=========================+ +| ``decimal`` | ``DecimalType(P=38, S=18)`` | ``decimal(P=38, S=18)`` | ``decimal`` (unbounded) | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | | ++----------------------------------+-----------------------------------+-------------------------------+ | +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``decimal(P=39.., S=0..)`` | unsupported [2]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``double precision`` | ``DoubleType()`` | ``double precision`` | ``double precision`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``-`` | ``ByteType()`` | unsupported | unsupported | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``integer`` | ``IntegerType()`` | ``integer`` | ``integer`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ +| ``money`` | unsupported | | | ++----------------------------------+ | | | +| ``int4range`` | | | | ++----------------------------------+ | | | +| ``int8range`` | | | | ++----------------------------------+ | | | +| ``numrange`` | | | | ++----------------------------------+ | | | +| ``int2vector`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+-------------------------+ + +.. [2] + + Greenplum support decimal types with unlimited precision. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38`` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
+
+Temporal types
+~~~~~~~~~~~~~~
+
++------------------------------------+-------------------------+------------------------+-------------------------+
+| Greenplum type (read)              | Spark type              | Greenplum type (write) | Greenplum type (create) |
++====================================+=========================+========================+=========================+
+| ``date``                           | ``DateType()``          | ``date``               | ``date``                |
++------------------------------------+-------------------------+------------------------+-------------------------+
+| ``time``                           | ``TimestampType()``,    | ``timestamp``          | ``timestamp``           |
++------------------------------------+ time format quirks [3]_ |                        |                         |
+| ``time(0..6)``                     |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``time with time zone``            |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``time(0..6) with time zone``      |                         |                        |                         |
++------------------------------------+-------------------------+------------------------+-------------------------+
+| ``timestamp``                      | ``TimestampType()``     | ``timestamp``          | ``timestamp``           |
++------------------------------------+                         |                        |                         |
+| ``timestamp(0..6)``                |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``timestamp with time zone``       |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``timestamp(0..6) with time zone`` |                         |                        |                         |
++------------------------------------+-------------------------+------------------------+-------------------------+
+| ``interval`` of any precision      | unsupported             |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``daterange``                      |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``tsrange``                        |                         |                        |                         |
++------------------------------------+                         |                        |                         |
+| ``tstzrange``                      |                         |                        |                         |
++------------------------------------+-------------------------+------------------------+-------------------------+
+
+.. [3]
+
+    The ``time`` type is handled as ``timestamp`` with the date part set to ``1970-01-01``. So instead of reading a value like ``23:59:59``
+    from Greenplum, you actually get ``1970-01-01 23:59:59``, and vice versa.
+
+String types
+~~~~~~~~~~~~
+
++-----------------------------+------------------+------------------------+-------------------------+
+| Greenplum type (read)       | Spark type       | Greenplum type (write) | Greenplum type (create) |
++=============================+==================+========================+=========================+
+| ``character``               | ``StringType()`` | ``text``               | ``text``                |
++-----------------------------+                  |                        |                         |
+| ``character(N)``            |                  |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``character varying``       |                  |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``character varying(N)``    |                  |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``text``                    |                  |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``xml``                     |                  |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``CREATE TYPE ... AS ENUM`` |                  |                        |                         |
++-----------------------------+------------------+------------------------+-------------------------+
+| ``json``                    | unsupported      |                        |                         |
++-----------------------------+                  |                        |                         |
+| ``jsonb``                   |                  |                        |                         |
++-----------------------------+------------------+------------------------+-------------------------+
+
+Binary types
+~~~~~~~~~~~~
+
++--------------------------+-------------------+------------------------+-------------------------+
+| Greenplum type (read)    | Spark type        | Greenplum type (write) | Greenplum type (create) |
++==========================+===================+========================+=========================+
+| ``boolean``              | ``BooleanType()`` | ``boolean``            | ``boolean``             |
++--------------------------+-------------------+------------------------+-------------------------+
+| ``bit``                  | unsupported       |                        |                         |
++--------------------------+                   |                        |                         |
+| ``bit(N)``               |                   |                        |                         |
++--------------------------+                   |                        |                         |
+| ``bit varying``          |                   |                        |                         |
++--------------------------+                   |                        |                         |
+| ``bit varying(N)``       |                   |                        |                         |
++--------------------------+-------------------+------------------------+-------------------------+
+| ``bytea``                | unsupported [4]_  |                        |                         |
++--------------------------+-------------------+------------------------+-------------------------+
+| ``-``                    | ``BinaryType()``  | ``bytea``              | ``bytea``               |
++--------------------------+-------------------+------------------------+-------------------------+
+
+.. [4] Yes, that's weird.
+
+Struct types
+~~~~~~~~~~~~
+
++--------------------------------+------------------+------------------------+-------------------------+
+| Greenplum type (read)          | Spark type       | Greenplum type (write) | Greenplum type (create) |
++================================+==================+========================+=========================+
+| ``T[]``                        | unsupported      |                        |                         |
++--------------------------------+------------------+------------------------+-------------------------+
+| ``-``                          | ``ArrayType()``  | unsupported            |                         |
++--------------------------------+------------------+------------------------+-------------------------+
+| ``CREATE TYPE sometype (...)`` | ``StringType()`` | ``text``               | ``text``                |
++--------------------------------+------------------+------------------------+-------------------------+
+| ``-``                          | ``StructType()`` | unsupported            |                         |
++--------------------------------+------------------+                        |                         |
+| ``-``                          | ``MapType()``    |                        |                         |
++--------------------------------+------------------+------------------------+-------------------------+
+
+Unsupported types
+-----------------
+
+Columns of these types cannot be read/written by Spark:
+ * ``cidr``
+ * ``inet``
+ * ``macaddr``
+ * ``macaddr8``
+ * ``circle``
+ * ``box``
+ * ``line``
+ * ``lseg``
+ * ``path``
+ * ``point``
+ * ``polygon``
+ * ``tsvector``
+ * ``tsquery``
+ * ``uuid``
+
+There is a way to avoid this - just cast unsupported types to ``text``. But the way this can be done is not straightforward.
+
+Read unsupported column type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unfortunately, it is not possible to cast an unsupported column to some supported type on the ``DBReader`` side:
+
+.. code-block:: python
+
+    DBReader(
+        connection=greenplum,
+        # will fail
+        columns=["CAST(column AS text)"],
+    )
+
+This is caused by the Greenplum connector implementation. Instead of passing this ``CAST`` expression to the ``SELECT`` query
+as-is, it performs the type cast on the Spark side, so this syntax is not supported.
+
+But there is a workaround - create a view that casts the unsupported column to ``text`` (or any other supported type).
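+
+For simple scalar types (e.g. ``uuid`` or ``inet``) a plain cast to ``text`` is usually enough. Below is a minimal sketch of
+this approach; the view, table and column names are only illustrative:
+
+.. code:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    # create a view casting the unsupported column to text
+    greenplum.execute(
+        """
+        CREATE VIEW schema.view_with_text_column AS
+        SELECT
+            id,
+            uuid_column::text AS uuid_column,
+            gp_segment_id -- ! important !
+        FROM
+            schema.table_with_uuid_column
+        """,
+    )
+
+    # text columns are read as StringType()
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.view_with_text_column",
+    )
+    df = reader.run()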
+
+For example, you can use the ``to_json`` Postgres function to convert a column of any type to its string representation.
+You can then parse this column on the Spark side using `from_json `_:
+
+.. code:: python
+
+    from pyspark.sql.functions import from_json
+    from pyspark.sql.types import ArrayType, IntegerType
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBReader
+
+    greenplum = Greenplum(...)
+
+    # create view with proper type cast
+    greenplum.execute(
+        """
+        CREATE VIEW schema.view_with_json_column AS
+        SELECT
+            id,
+            supported_column,
+            to_json(array_column) array_column_as_json,
+            gp_segment_id -- ! important !
+        FROM
+            schema.table_with_unsupported_columns
+        """,
+    )
+
+    # create dataframe using this view
+    reader = DBReader(
+        connection=greenplum,
+        source="schema.view_with_json_column",
+    )
+    df = reader.run()
+
+    # Spark requires all columns to have some type, describe it
+    column_type = ArrayType(IntegerType())
+
+    # cast column content to proper Spark type
+    df = df.select(
+        df.id,
+        df.supported_column,
+        from_json(df.array_column_as_json, column_type).alias("array_column"),
+    )
+
+Write unsupported column type
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is always possible to convert data to string on the Spark side, and then write it to a ``text`` column in a Greenplum table.
+
+For example, you can convert data using the `to_json `_ function.
+
+.. code:: python
+
+    from pyspark.sql.functions import to_json
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBWriter
+
+    greenplum = Greenplum(...)
+
+    greenplum.execute(
+        """
+        CREATE TABLE schema.target_table (
+            id int,
+            supported_column timestamp,
+            array_column_as_json jsonb -- or text
+        )
+        DISTRIBUTED BY (id)
+        """,
+    )
+
+    write_df = df.select(
+        df.id,
+        df.supported_column,
+        to_json(df.array_column).alias("array_column_as_json"),
+    )
+
+    writer = DBWriter(
+        connection=greenplum,
+        target="schema.target_table",
+    )
+    writer.run(write_df)
+
+Then you can parse this column on the Greenplum side:
+
+.. code-block:: sql
+
+    SELECT
+        id,
+        supported_column,
+        -- access first item of an array
+        array_column_as_json->0
+    FROM
+        schema.target_table
diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst
index 8dfa5a4c6..11309db14 100644
--- a/docs/connection/db_connection/greenplum/write.rst
+++ b/docs/connection/db_connection/greenplum/write.rst
@@ -1,10 +1,46 @@
 .. _greenplum-write:
 
-Writing to Greenplum
-=====================
+Writing to Greenplum using ``DBWriter``
+========================================
 
-For writing data to Greenplum, use :obj:`DBWriter ` with options below.
+For writing data to Greenplum, use :obj:`DBWriter `
+with :obj:`GreenplumWriteOptions `.
 
+.. warning::
+
+    Please take into account :ref:`greenplum-types`.
+
+.. warning::
+
+    It is always recommended to create the table explicitly using :obj:`Greenplum.execute `
+    instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different types than expected.
+
+Examples
+--------
+
+.. code-block:: python
+
+    from onetl.connection import Greenplum
+    from onetl.db import DBWriter
+
+    greenplum = Greenplum(...)
+
+    df = ... 
# data is here + + writer = DBWriter( + connection=greenplum, + target="schema.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + + writer.run(df) Interaction schema ------------------ @@ -87,18 +123,8 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d deactivate "Spark driver" @enduml -Recommendations ---------------- - -Mapping of Spark types to Greenplum types -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -See `official documentation `_ -for more details. -onETL does not perform any additional casting of types while writing data. - -Options -------- +Write options +------------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index b508ea3ba..d1c378cd7 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -64,7 +64,7 @@ class Greenplum(JDBCMixin, DBConnection): """Greenplum connection. |support_hooks| Based on package ``io.pivotal:greenplum-spark:2.3.0`` - (`VMware Greenplum connector for Spark `_). + (`VMware Greenplum connector for Spark `_). .. warning:: From e4ccd9f1a00b3cddabb488627462ee5c47e520a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 5 Mar 2024 11:06:40 +0000 Subject: [PATCH 33/63] Fix Options documentation --- onetl/connection/db_connection/greenplum/options.py | 6 +++--- onetl/connection/db_connection/kafka/options.py | 6 ++---- onetl/connection/db_connection/mongodb/options.py | 12 ++++++------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 123edec1b..8779722fa 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -76,7 +76,7 @@ class GreenplumReadOptions(JDBCOptions): .. warning:: Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, - etc are populated from connection attributes, and cannot be set in ``ReadOptions`` class + etc are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -208,8 +208,8 @@ class GreenplumWriteOptions(JDBCOptions): .. warning:: - Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, - etc are populated from connection attributes, and cannot be set in ``WriteOptions`` class + Some options, like ``url``, ``dbtable``, ``server.*``, ``pool.*``, etc + are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. Examples -------- diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 28bf44f6b..7ca76c1fc 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -71,8 +71,7 @@ class KafkaReadOptions(GenericOptions): * ``subscribe`` * ``subscribePattern`` - are populated from connection attributes, and cannot be set in ``KafkaReadOptions`` class and be overridden - by the user to avoid issues. 
+ are populated from connection attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -117,8 +116,7 @@ class KafkaWriteOptions(GenericOptions): * ``kafka.*`` * ``topic`` - are populated from connection attributes, and cannot be set in ``KafkaWriteOptions`` class and be overridden - by the user to avoid issues. + are populated from connection attributes, and cannot be overridden by the user in ``WriteOptions`` to avoid issues. Examples -------- diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 86ff82f1b..4b4656fa5 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -96,7 +96,7 @@ class MongoDBPipelineOptions(GenericOptions): .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -104,7 +104,7 @@ class MongoDBPipelineOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection``, ``pipeline`` are populated from connection attributes, - and cannot be set in ``PipelineOptions`` class. + and cannot be overridden by the user in ``PipelineOptions`` to avoid issues. Examples -------- @@ -130,7 +130,7 @@ class MongoDBReadOptions(GenericOptions): .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -138,7 +138,7 @@ class MongoDBReadOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection``, ``pipeline``, ``hint`` are populated from connection - attributes, and cannot be set in ``ReadOptions`` class. + attributes, and cannot be overridden by the user in ``ReadOptions`` to avoid issues. Examples -------- @@ -164,7 +164,7 @@ class MongoDBWriteOptions(GenericOptions): .. note :: You can pass any value - `supported by connector `_, + `supported by connector `_, even if it is not mentioned in this documentation. The set of supported options depends on connector version. See link above. @@ -172,7 +172,7 @@ class MongoDBWriteOptions(GenericOptions): .. warning:: Options ``uri``, ``database``, ``collection`` are populated from connection attributes, - and cannot be set in ``WriteOptions`` class. + and cannot be overridden by the user in ``WriteOptions`` to avoid issues. 
Examples -------- From 82442bbc4e83eb634d15967de391f5500950fa07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 5 Mar 2024 11:20:06 +0000 Subject: [PATCH 34/63] [DOP-13337] Use typing_extensions.deprecated instead of deprecated package --- onetl/connection/db_connection/hive/options.py | 6 ++---- onetl/connection/db_connection/jdbc_connection/options.py | 6 ++---- onetl/log.py | 6 ++---- requirements/core.txt | 5 ++--- 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 222f914fc..54acccc64 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -6,8 +6,8 @@ from enum import Enum from typing import List, Optional, Tuple, Union -from deprecated import deprecated from pydantic import Field, root_validator, validator +from typing_extensions import deprecated from onetl.impl import GenericOptions @@ -339,9 +339,7 @@ def _mode_is_deprecated(cls, values): @deprecated( - version="0.5.0", - reason="Please use 'WriteOptions' class instead. Will be removed in v1.0.0", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'WriteOptions' instead", category=UserWarning, ) class HiveLegacyOptions(HiveWriteOptions): diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 8793fc084..5a43c585f 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -6,8 +6,8 @@ from enum import Enum from typing import Optional -from deprecated import deprecated from pydantic import Field, PositiveInt, root_validator +from typing_extensions import deprecated from onetl._internal import to_camel from onetl.connection.db_connection.jdbc_mixin.options import JDBCOptions @@ -512,9 +512,7 @@ def _mode_is_deprecated(cls, values): @deprecated( - version="0.5.0", - reason="Please use 'ReadOptions' or 'WriteOptions' class instead. Will be removed in v1.0.0", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'ReadOptions' or 'WriteOptions' instead", category=UserWarning, ) class JDBCLegacyOptions(JDBCReadOptions, JDBCWriteOptions): diff --git a/onetl/log.py b/onetl/log.py index c97fe6620..32c239b8b 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -11,8 +11,8 @@ from textwrap import dedent from typing import TYPE_CHECKING, Any, Collection, Iterable -from deprecated import deprecated from etl_entities.hwm import HWM +from typing_extensions import deprecated if TYPE_CHECKING: from pyspark.sql import DataFrame @@ -33,9 +33,7 @@ @deprecated( - version="0.5.0", - reason="Will be removed in 1.0.0, use 'setup_logging' instead", - action="always", + "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'setup_logging' instead", category=UserWarning, ) def setup_notebook_logging(level: int | str = logging.INFO) -> None: diff --git a/requirements/core.txt b/requirements/core.txt index f0ca45793..27c17ceaa 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,4 +1,3 @@ -deprecated etl-entities>=2.2,<2.3 evacuator>=1.0,<1.1 frozendict @@ -12,5 +11,5 @@ platformdirs pydantic>=1.9.2,!=1.10.2,<2 pyyaml # https://github.com/python/typing_extensions/pull/188 -typing_extensions>=4.0.0,!=4.6.1; python_version == "3.7" -typing_extensions>=4.0.0; python_version > "3.7" +typing_extensions>=4.5.0,!=4.6.1; python_version == "3.7" +typing_extensions>=4.5.0; python_version > "3.7" From 66057e0f4e280d235dda0a8bf4112c28e4fc8e01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 5 Mar 2024 12:11:47 +0000 Subject: [PATCH 35/63] [DOP-13337] Use typing_extensions.deprecated instead of deprecated package --- tests/tests_unit/tests_db_connection_unit/test_hive_unit.py | 2 +- .../tests_db_connection_unit/test_jdbc_options_unit.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py index 01cffd4a8..891406ba4 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_hive_unit.py @@ -103,7 +103,7 @@ def get_current_cluster() -> str: def test_hive_old_options_deprecated(): - warning_msg = "Please use 'WriteOptions' class instead. Will be removed in v1.0.0" + warning_msg = "Deprecated in 0.5.0 and will be removed in 1.0.0. Use 'WriteOptions' instead" with pytest.warns(UserWarning, match=warning_msg): options = Hive.Options(some="value") diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index ff33ef2af..7c8ecfcca 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -129,7 +129,7 @@ def test_jdbc_read_options_cannot_be_used_in_write_options(arg, value): ], ) def test_jdbc_old_options_allowed_but_deprecated(arg, value): - warning_msg = "Please use 'ReadOptions' or 'WriteOptions' class instead. Will be removed in v1.0.0" + warning_msg = "Deprecated in 0.5.0 and will be removed in 1.0.0. 
Use 'ReadOptions' or 'WriteOptions' instead" with pytest.warns(UserWarning, match=warning_msg): options = Postgres.Options.parse({arg: value}) From a994486ddb744df383b6f4ef93c229321682256a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 5 Mar 2024 12:17:16 +0000 Subject: [PATCH 36/63] [DOP-13337] Use typing_extensions.deprecated on legacy FileFilter and FileLimit --- onetl/core/file_filter/file_filter.py | 2 ++ onetl/core/file_limit/file_limit.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/onetl/core/file_filter/file_filter.py b/onetl/core/file_filter/file_filter.py index a3dac8654..5f71ff25a 100644 --- a/onetl/core/file_filter/file_filter.py +++ b/onetl/core/file_filter/file_filter.py @@ -10,11 +10,13 @@ from typing import List, Optional, Union from pydantic import Field, root_validator, validator +from typing_extensions import deprecated from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel, RemotePath +@deprecated("Deprecated in 0.8.0 and will be removed in 1.0.0. Use Glob, Regexp or ExcludeDir instead", category=None) class FileFilter(BaseFileFilter, FrozenModel): r"""Filter files or directories by their path. diff --git a/onetl/core/file_limit/file_limit.py b/onetl/core/file_limit/file_limit.py index b178433e7..0e5637f8b 100644 --- a/onetl/core/file_limit/file_limit.py +++ b/onetl/core/file_limit/file_limit.py @@ -6,11 +6,13 @@ import warnings from pydantic import validator +from typing_extensions import deprecated from onetl.base import BaseFileLimit, PathProtocol from onetl.impl import FrozenModel +@deprecated("Deprecated in 0.8.0 and will be removed in 1.0.0. Use MaxFilesCount instead", category=None) class FileLimit(BaseFileLimit, FrozenModel): """Limits the number of downloaded files. 
From f92c602041b0ad281251bbe124f746b722d73f0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 5 Mar 2024 12:42:24 +0000 Subject: [PATCH 37/63] [DOP-13337] Add Pydantic v2 support --- .github/workflows/data/base/tracked.txt | 1 + .github/workflows/data/clickhouse/matrix.yml | 3 + .github/workflows/data/core/matrix.yml | 3 + .github/workflows/data/ftp/matrix.yml | 15 ++++- .github/workflows/data/ftps/matrix.yml | 15 ++++- .github/workflows/data/greenplum/matrix.yml | 18 +++++- .github/workflows/data/hdfs/matrix.yml | 3 + .github/workflows/data/hive/matrix.yml | 3 + .github/workflows/data/kafka/matrix.yml | 3 + .github/workflows/data/local-fs/matrix.yml | 5 ++ .github/workflows/data/mongodb/matrix.yml | 3 + .github/workflows/data/mssql/matrix.yml | 3 + .github/workflows/data/mysql/matrix.yml | 3 + .github/workflows/data/oracle/matrix.yml | 3 + .github/workflows/data/postgres/matrix.yml | 3 + .github/workflows/data/s3/matrix.yml | 3 + .github/workflows/data/samba/matrix.yml | 15 ++++- .github/workflows/data/sftp/matrix.yml | 9 ++- .github/workflows/data/teradata/matrix.yml | 2 + .github/workflows/data/webdav/matrix.yml | 15 ++++- .github/workflows/nightly.yml | 57 ++++++++++++------- .github/workflows/test-clickhouse.yml | 19 ++++--- .github/workflows/test-core.yml | 19 ++++--- .github/workflows/test-ftp.yml | 13 +++-- .github/workflows/test-ftps.yml | 13 +++-- .github/workflows/test-greenplum.yml | 19 ++++--- .github/workflows/test-hdfs.yml | 19 ++++--- .github/workflows/test-hive.yml | 19 ++++--- .github/workflows/test-kafka.yml | 19 ++++--- .github/workflows/test-local-fs.yml | 19 ++++--- .github/workflows/test-mongodb.yml | 19 ++++--- .github/workflows/test-mssql.yml | 19 ++++--- .github/workflows/test-mysql.yml | 19 ++++--- .github/workflows/test-oracle.yml | 19 ++++--- .github/workflows/test-postgres.yml | 19 ++++--- .github/workflows/test-s3.yml | 19 ++++--- .github/workflows/test-samba.yml | 13 +++-- .github/workflows/test-sftp.yml | 13 +++-- .github/workflows/test-teradata.yml | 19 ++++--- .github/workflows/test-webdav.yml | 13 +++-- .github/workflows/tests.yml | 57 ++++++++++++------- CONTRIBUTING.rst | 1 + docker/Dockerfile | 3 +- docs/changelog/next_release/230.feature.rst | 1 + onetl/_internal.py | 6 +- .../db_connection/db_connection/connection.py | 5 +- .../db_connection/greenplum/connection.py | 6 +- .../db_connection/greenplum/options.py | 5 +- .../db_connection/hive/connection.py | 6 +- .../connection/db_connection/hive/options.py | 6 +- .../db_connection/jdbc_connection/options.py | 6 +- .../db_connection/jdbc_mixin/connection.py | 5 +- .../db_connection/jdbc_mixin/options.py | 5 +- .../db_connection/kafka/connection.py | 6 +- .../db_connection/kafka/kafka_basic_auth.py | 5 +- .../kafka/kafka_kerberos_auth.py | 5 +- .../db_connection/kafka/kafka_scram_auth.py | 6 +- .../db_connection/kafka/kafka_ssl_protocol.py | 5 +- .../connection/db_connection/kafka/options.py | 5 +- .../db_connection/mongodb/connection.py | 6 +- .../db_connection/mongodb/options.py | 5 +- .../db_connection/oracle/connection.py | 5 +- onetl/connection/file_connection/ftp.py | 6 +- .../file_connection/hdfs/connection.py | 6 +- onetl/connection/file_connection/s3.py | 7 ++- onetl/connection/file_connection/samba.py | 6 +- onetl/connection/file_connection/sftp.py | 6 +- onetl/connection/file_connection/webdav.py | 7 ++- 
.../spark_file_df_connection.py | 5 +- .../spark_hdfs/connection.py | 6 +- .../file_df_connection/spark_local_fs.py | 5 +- .../file_df_connection/spark_s3/connection.py | 7 ++- onetl/core/file_filter/file_filter.py | 6 +- onetl/core/file_limit/file_limit.py | 6 +- onetl/db/db_reader/db_reader.py | 6 +- onetl/db/db_writer/db_writer.py | 5 +- onetl/file/file_df_reader/file_df_reader.py | 6 +- onetl/file/file_df_reader/options.py | 5 +- onetl/file/file_df_writer/file_df_writer.py | 5 +- onetl/file/file_df_writer/options.py | 5 +- onetl/file/file_downloader/file_downloader.py | 6 +- onetl/file/file_downloader/options.py | 5 +- onetl/file/file_downloader/result.py | 5 +- onetl/file/file_mover/file_mover.py | 6 +- onetl/file/file_mover/options.py | 5 +- onetl/file/file_mover/result.py | 5 +- onetl/file/file_result.py | 6 +- onetl/file/file_uploader/file_uploader.py | 6 +- onetl/file/file_uploader/options.py | 5 +- onetl/file/file_uploader/result.py | 5 +- onetl/file/filter/exclude_dir.py | 5 +- onetl/file/filter/glob.py | 5 +- onetl/file/filter/regexp.py | 5 +- onetl/file/format/avro.py | 5 +- onetl/file/format/csv.py | 5 +- onetl/file/format/xml.py | 5 +- onetl/hwm/auto_hwm.py | 7 ++- onetl/hwm/store/yaml_hwm_store.py | 6 +- onetl/impl/base_model.py | 7 ++- onetl/impl/generic_options.py | 5 +- onetl/strategy/batch_hwm_strategy.py | 5 +- requirements/core.txt | 4 +- requirements/docs-plantuml.txt | 2 - requirements/docs.txt | 2 +- requirements/tests/pydantic-1.txt | 1 + requirements/tests/pydantic-2.txt | 1 + requirements/tests/pydantic-latest.txt | 1 + setup.cfg | 2 +- 108 files changed, 658 insertions(+), 250 deletions(-) create mode 100644 docs/changelog/next_release/230.feature.rst delete mode 100644 requirements/docs-plantuml.txt create mode 100644 requirements/tests/pydantic-1.txt create mode 100644 requirements/tests/pydantic-2.txt create mode 100644 requirements/tests/pydantic-latest.txt diff --git a/.github/workflows/data/base/tracked.txt b/.github/workflows/data/base/tracked.txt index a2fac112a..c7bb90eb0 100644 --- a/.github/workflows/data/base/tracked.txt +++ b/.github/workflows/data/base/tracked.txt @@ -3,4 +3,5 @@ .github/workflows/data/base/** requirements/core.txt requirements/tests/base.txt +requirements/tests/pydantic-*.txt .env.local diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 7c9b53384..9c8c558ba 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index 522e2160a..a7339e139 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/ftp/matrix.yml 
b/.github/workflows/data/ftp/matrix.yml index 86243228c..49468d914 100644 --- a/.github/workflows/data/ftp/matrix.yml +++ b/.github/workflows/data/ftp/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/ftps image has only latest tag - ftp-version: latest <<: *max - full: &full + full: - ftp-version: latest <<: *min - ftp-version: latest <<: *max - nightly: *full + nightly: + - ftp-version: latest + <<: *min + - ftp-version: latest + <<: *latest diff --git a/.github/workflows/data/ftps/matrix.yml b/.github/workflows/data/ftps/matrix.yml index bb61dc6b0..ec5a862cd 100644 --- a/.github/workflows/data/ftps/matrix.yml +++ b/.github/workflows/data/ftps/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/ftps image has only latest tag - ftps-version: latest <<: *max - full: &full + full: - ftps-version: latest <<: *min - ftps-version: latest <<: *max - nightly: *full + nightly: + - ftps-version: latest + <<: *min + - ftps-version: latest + <<: *latest diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 7715ed5c3..05afbb3bd 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -1,6 +1,7 @@ min: &min # Spark 2.3.0 does not support passing ivysettings.xml spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -8,6 +9,15 @@ min: &min max: &max # Greenplum connector does not support Spark 3.3+ spark-version: 3.2.4 + pydantic-version: 2 + python-version: '3.10' + java-version: 11 + os: ubuntu-latest + +latest: &latest + # Greenplum connector does not support Spark 3.3+ + spark-version: 3.2.4 + pydantic-version: latest python-version: '3.10' java-version: 11 os: ubuntu-latest @@ -16,9 +26,13 @@ matrix: small: - greenplum-version: 7.0.0 <<: *max - full: &full + full: - greenplum-version: 6.25.3 <<: *min - greenplum-version: 7.0.0 <<: *max - nightly: *full + nightly: + - greenplum-version: 6.25.3 + <<: *min + - greenplum-version: 7.0.0 + <<: *latest diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index c117c8d6f..e62f0242a 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -1,6 +1,7 @@ min: &min hadoop-version: hadoop2-hdfs spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -8,6 +9,7 @@ min: &min max: &max hadoop-version: hadoop3-hdfs spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest @@ -15,6 +17,7 @@ max: &max latest: &latest hadoop-version: hadoop3-hdfs spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 370d22d95..0f7d4ba6b 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: 
'3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 26ccf4af7..29e587721 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -1,6 +1,7 @@ min: &min # kafka_version: 0.10.2-1-r3 kafka-version: 3.5.1 + pydantic-version: 1 spark-version: 2.4.8 python-version: '3.7' java-version: 8 @@ -8,6 +9,7 @@ min: &min max: &max kafka-version: 3.5.1 + pydantic-version: 2 spark-version: 3.5.0 python-version: '3.12' java-version: 20 @@ -15,6 +17,7 @@ max: &max latest: &latest kafka-version: latest + pydantic-version: latest spark-version: latest python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index 6dcf8efd3..b3db2391f 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -1,29 +1,34 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest min_avro: &min_avro spark-version: 2.4.8 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest min_excel: &min_excel spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 34e66b85a..c916cc306 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,18 +1,21 @@ min: &min # MongoDB connector does not support Spark 2 spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.4.2 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index bbb121c39..0138805bb 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index aa1c575d1..9b64e3b93 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' 
java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 1230f659c..55dc4c185 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 78de0b38b..8cdff4f63 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -1,17 +1,20 @@ min: &min spark-version: 2.3.1 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest max: &max spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index b2d595d6b..a0825603b 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -3,6 +3,7 @@ min: &min minio-version: 2021.3.17 # Minimal Spark version with Hadoop 3.x support spark-version: 3.2.4 + pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest @@ -10,6 +11,7 @@ min: &min max: &max minio-version: 2023.7.18 spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest @@ -17,6 +19,7 @@ max: &max latest: &latest minio-version: latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index f03adcd08..5b0b2628e 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -10,9 +17,13 @@ matrix: small: - server-version: latest <<: *max - full: &full + full: - server-version: latest <<: *min - server-version: latest <<: *max - nightly: *full + nightly: + - server-version: latest + <<: *min + - server-version: latest + <<: *latest diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 3379a6e3e..0dfd9e730 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -20,4 +27,4 @@ matrix: - openssh-version: 8.1_p1-r0-ls5 <<: *min - openssh-version: latest - <<: *max + <<: *latest diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 5391077c9..9647daec6 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,11 +1,13 @@ max: &max 
spark-version: 3.5.0 + pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest latest: &latest spark-version: latest + pydantic-version: latest python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index 0a67ff838..8d8f012a7 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -1,8 +1,15 @@ min: &min + pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + pydantic-version: 2 + python-version: '3.12' + os: ubuntu-latest + +latest: &latest + pydantic-version: latest python-version: '3.12' os: ubuntu-latest @@ -11,9 +18,13 @@ matrix: # chonjay21/webdav image has only latest tag - webdav-version: latest <<: *max - full: &full + full: - webdav-version: latest <<: *min - webdav-version: latest <<: *max - nightly: *full + nightly: + - webdav-version: latest + <<: *min + - webdav-version: latest + <<: *latest diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 58b34cee1..3c3b9d3bc 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -20,7 +20,7 @@ jobs: nightly: true tests-core: - name: Run core tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run core tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -30,13 +30,14 @@ jobs: uses: ./.github/workflows/test-core.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-clickhouse: - name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -48,13 +49,14 @@ jobs: clickhouse-image: ${{ matrix.clickhouse-image }} clickhouse-version: ${{ matrix.clickhouse-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-greenplum: - name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -65,6 +67,7 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} @@ -74,7 +77,7 
@@ jobs: GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} tests-hive: - name: Run Hive tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Hive tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -84,13 +87,14 @@ jobs: uses: ./.github/workflows/test-hive.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-kafka: - name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -101,13 +105,14 @@ jobs: with: kafka-version: ${{ matrix.kafka-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-local-fs: - name: Run LocalFS tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run LocalFS tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -117,13 +122,14 @@ jobs: uses: ./.github/workflows/test-local-fs.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mongodb: - name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -134,13 +140,14 @@ jobs: with: mongodb-version: ${{ matrix.mongodb-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mssql: - name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -151,13 +158,14 @@ jobs: with: 
mssql-version: ${{ matrix.mssql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-mysql: - name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -168,13 +176,14 @@ jobs: with: mysql-version: ${{ matrix.mysql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-oracle: - name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -187,13 +196,14 @@ jobs: oracle-version: ${{ matrix.oracle-version }} db-name: ${{ matrix.db-name }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-postgres: - name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -204,13 +214,14 @@ jobs: with: postgres-version: ${{ matrix.postgres-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-teradata: - name: Run Teradata tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Teradata tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -220,13 +231,14 @@ jobs: uses: ./.github/workflows/test-teradata.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-ftp: - name: Run FTP tests (server=${{ matrix.ftp-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTP tests (server=${{ matrix.ftp-version }}, pydantic=${{ 
matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -236,12 +248,13 @@ jobs: uses: ./.github/workflows/test-ftp.yml with: ftp-version: ${{ matrix.ftp-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-ftps: - name: Run FTPS tests (server=${{ matrix.ftps-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTPS tests (server=${{ matrix.ftps-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -251,12 +264,13 @@ jobs: uses: ./.github/workflows/test-ftps.yml with: ftps-version: ${{ matrix.ftps-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-hdfs: - name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -267,13 +281,14 @@ jobs: with: hadoop-version: ${{ matrix.hadoop-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-s3: - name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -284,13 +299,14 @@ jobs: with: minio-version: ${{ matrix.minio-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-sftp: - name: Run SFTP tests (server=${{ matrix.openssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run SFTP tests (server=${{ matrix.openssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -300,12 +316,13 @@ jobs: uses: ./.github/workflows/test-sftp.yml with: openssh-version: ${{ matrix.openssh-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-samba: - name: Run Samba tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Samba tests (server=${{ matrix.server-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -315,12 +332,13 @@ jobs: uses: ./.github/workflows/test-samba.yml with: server-version: ${{ matrix.server-version }} 
+ pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false tests-webdav: - name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run WebDAV tests (server=${{ matrix.openwebdavssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -330,6 +348,7 @@ jobs: uses: ./.github/workflows/test-webdav.yml with: webdav-version: ${{ matrix.webdav-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} with-cache: false diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 236c1b125..4a29f84cb 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -11,6 +11,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -27,7 +30,7 @@ on: jobs: test-clickhouse: - name: Run Clickhouse tests (server=${{ inputs.clickhouse-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Clickhouse tests (server=${{ inputs.clickhouse-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: clickhouse: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ 
inputs.python-version }}-tests-clickhouse- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/clickhouse.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/clickhouse.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Clickhouse to be ready run: | diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index 6e9036df1..cdd977398 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-core: - name: Run core tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run core tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-clickhouse- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-clickhouse- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core-${{ hashFiles('requirements/core.txt', 
'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-core- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-core- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 962013077..4e947d738 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -5,6 +5,9 @@ on: ftp-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-ftp: - name: Run FTP tests (server=${{ inputs.ftp-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run FTP tests (server=${{ inputs.ftp-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftp- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftp- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftp/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
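The test-ftp.yml hunk above shows the pattern this series applies to every reusable test workflow: declare a required pydantic-version string input, thread it into the job name and the pip cache key, and append a matching constraints file to the pip install. The contents of requirements/tests/pydantic-*.txt are not part of this diff; a minimal sketch, assuming one constraints file per supported pydantic major line, could look like:

    # requirements/tests/pydantic-1.txt -- hypothetical contents, not shown in this patch
    pydantic>=1.10,<2.0

    # requirements/tests/pydantic-2.txt -- hypothetical contents, not shown in this patch
    pydantic>=2.0,<3.0
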
diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 3be745aed..19cce458c 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -5,6 +5,9 @@ on: ftps-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-ftps: - name: Run FTPS tests (server=${{ inputs.ftps-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run FTPS tests (server=${{ inputs.ftps-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-ftps- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps-${{ hashFiles('requirements/core.txt', 'requirements/ftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-ftps- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/ftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-ftps/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. 
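Once interpolated, each pip cache key is a plain string that now varies by pydantic version as well as Python version. An illustrative value for the FTPS job on an Ubuntu runner (the version numbers here are examples, not taken from the matrix files):

    Linux-python-3.10-pydantic-2-tests-ftps-<hashFiles() digest of requirements/core.txt, requirements/ftp.txt, requirements/tests/base.txt>

The restore-keys fall back first to this exact key and then to the Linux-python-3.10-pydantic-2-tests-ftps- prefix, so a cache built against one pydantic major version is never restored into a job testing another.
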
diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 94b99436c..d8174979e 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -30,7 +33,7 @@ on: jobs: test-greenplum: if: github.repository == 'MobileTeleSystems/onetl' # prevent running on forks - name: Run Greenplum tests (server=${{ inputs.greenplum-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Greenplum tests (server=${{ inputs.greenplum-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: greenplum: @@ -60,20 +63,20 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-greenplum- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-greenplum- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-greenplum- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-greenplum- - name: Set up Postgres client if: runner.os == 'Linux' @@ -86,7 +89,7 @@ jobs: - name: 
Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Greenplum to be ready run: | diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index c97f5357c..e97ee6249 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-hdfs: - name: Run HDFS tests (server=${{ inputs.hadoop-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run HDFS tests (server=${{ inputs.hadoop-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -53,27 +56,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hdfs- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hdfs- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hdfs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs-${{ hashFiles('requirements/core.txt', 'requirements/kerberos.txt', 'requirements/hdfs.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 
'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hdfs- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/kerberos.txt -r requirements/hdfs.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Cannot use services because we need to mount config file from the repo, but services start before checkout. # See https://github.com/orgs/community/discussions/25792 diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 6c77690fb..446cb76d0 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-hive: - name: Run Hive tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Hive tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-hive- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-hive- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-hive- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ 
inputs.pydantic-version }}-tests-hive-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-hive- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index a72172d84..d1389049f 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-kafka: - name: Run Kafka tests (server=${{ inputs.kafka-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Kafka tests (server=${{ inputs.kafka-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: @@ -91,27 +94,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-kafka- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-kafka- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-kafka- + ${{ runner.os }}-python-${{ 
inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/kafka.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-kafka- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/kafka.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Kafka to be ready run: | diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index 98c98e652..529deef41 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-local-fs: - name: Run LocalFS tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run LocalFS tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,27 +47,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-local-fs- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-local-fs- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ 
inputs.python-version }}-tests-local-fs- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-local-fs- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index 3366bb3cb..c842ce7af 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mongodb: - name: Run MongoDB tests (server=${{ inputs.mongodb-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MongoDB tests (server=${{ inputs.mongodb-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mongodb: @@ -56,27 +59,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mongodb- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mongodb- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 
'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mongodb- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mongodb.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mongodb- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mongodb.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MongoDB to be ready run: | diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 529f9edec..efc8edce8 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mssql: - name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MSSQL tests (server=${{ inputs.mssql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mssql: @@ -59,27 +62,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mssql- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mssql- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 
'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mssql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mssql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mssql- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mssql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MSSQL to be ready run: | diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index 36d800a57..c98562dd4 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-mysql: - name: Run MySQL tests (server=${{ inputs.mysql-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run MySQL tests (server=${{ inputs.mysql-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: mysql: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-mysql- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-mysql- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os 
}}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-mysql- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/mysql.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-mysql- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/mysql.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for MySQL to be ready run: | diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index a61395c5b..c22bc9553 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -15,6 +15,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -31,7 +34,7 @@ on: jobs: test-oracle: - name: Run Oracle tests (server=${{ inputs.oracle-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Oracle tests (server=${{ inputs.oracle-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: oracle: @@ -64,20 +67,20 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-oracle- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-oracle- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os 
}}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-oracle- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/oracle.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-oracle- - name: Set up Oracle instantclient if: runner.os == 'Linux' @@ -93,7 +96,7 @@ jobs: - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/oracle.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Oracle to be ready run: | diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index bc2666248..7ac821bc1 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-postgres: - name: Run Postgres tests (server=${{ inputs.postgres-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Postgres tests (server=${{ inputs.postgres-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: postgres: @@ -57,27 +60,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-postgres- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('onetl/connection/db_connection/*.py', 
'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-postgres- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-postgres- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/postgres.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-postgres- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/postgres.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for Postgres to be ready run: | diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 0796ed171..a75297fe3 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -8,6 +8,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -24,7 +27,7 @@ on: jobs: test-s3: - name: Run S3 tests (server=${{ inputs.minio-version }}, spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run S3 tests (server=${{ inputs.minio-version }}, spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: s3: @@ -58,27 +61,27 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os 
}}-ivy-${{ inputs.spark-version }}-tests-s3- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-s3- - name: Cache pip uses: actions/cache@v4 if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-s3- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3-${{ hashFiles('requirements/core.txt', 'requirements/s3.txt', 'requirements/tests/base.txt', 'requirements/tests/spark-*.txt', 'requirements/tests/pydantic-*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-s3- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/s3.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for S3 to be ready run: | diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 4612ff95f..3a7c1c921 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -5,6 +5,9 @@ on: server-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-samba: - name: Run Samba tests (server=${{ inputs.server-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Samba tests (server=${{ inputs.server-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ 
inputs.python-version }}-tests-samba- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-samba-${{ hashFiles('requirements/core.txt', 'requirements/samba.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-samba- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/samba.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' because of custom parameter for samba container start - name: Start Samba diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 301dd27c8..569d580f7 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -5,6 +5,9 @@ on: openssh-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-sftp: - name: Run SFTP tests (server=${{ inputs.openssh-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run SFTP tests (server=${{ inputs.openssh-version }}, pydantic=${{ inputs.pydantic-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} services: sftp: @@ -45,17 +48,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-sftp- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp-${{ hashFiles('requirements/core.txt', 'requirements/sftp.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-sftp- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/sftp.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Wait for SFTP to be ready run: | diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index 64d332994..214859e2b 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -5,6 +5,9 @@ on: spark-version: required: true type: string + pydantic-version: + required: true + type: string java-version: required: true type: string @@ -21,7 +24,7 @@ on: jobs: test-teradata: - name: Run Teradata tests (spark=${{ inputs.spark-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run Teradata 
tests (spark=${{ inputs.spark-version }}, pydantic=${{ inputs.pydantic-version }}, java=${{ inputs.java-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -44,26 +47,26 @@ jobs: if: inputs.with-cache with: path: ~/.ivy2 - key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + key: ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} restore-keys: | - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} - ${{ runner.os }}-ivy-${{ inputs.spark-version }}-tests-teradata- + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('onetl/connection/db_connection/*.py', 'onetl/connection/file_df_connection/*.py') }} + ${{ runner.os }}-ivy-${{ inputs.spark-version }}-${{ inputs.pydantic-version }}-tests-teradata- - name: Cache pip uses: actions/cache@v4 with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-tests-teradata- + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata-${{ hashFiles('requirements/core.txt', 'requirements/tests/base.txt', 'requirements/tests/spark*.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-spark-${{ inputs.spark-version }}-pydantic-${{ inputs.pydantic-version }}-tests-teradata- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt + pip install -I -r requirements/core.txt -r requirements/tests/base.txt -r requirements/tests/spark-${{ inputs.spark-version }}.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt - name: Run tests run: | diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index e259d4dfe..ee23f0ae8 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -5,6 +5,9 @@ on: webdav-version: required: true type: string + pydantic-version: + required: true + type: string python-version: required: true type: string @@ -18,7 +21,7 @@ on: jobs: test-webdav: - name: Run WebDAV tests (server=${{ inputs.webdav-version }}, python=${{ inputs.python-version }}, os=${{ inputs.os }}) + name: Run WebDAV tests (server=${{ inputs.webdav-version }}, pydantic=${{ inputs.pydantic-version }}, 
python=${{ inputs.python-version }}, os=${{ inputs.os }}) runs-on: ${{ inputs.os }} steps: @@ -35,17 +38,17 @@ jobs: if: inputs.with-cache with: path: ~/.cache/pip - key: ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + key: ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} restore-keys: | - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} - ${{ runner.os }}-python-${{ inputs.python-version }}-tests-webdav- + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav-${{ hashFiles('requirements/core.txt', 'requirements/webdav.txt', 'requirements/tests/base.txt') }} + ${{ runner.os }}-python-${{ inputs.python-version }}-pydantic-${{ inputs.pydantic-version }}-tests-webdav- - name: Upgrade pip run: python -m pip install --upgrade pip setuptools wheel - name: Install dependencies run: | - pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt + pip install -I -r requirements/core.txt -r requirements/webdav.txt -r requirements/tests/base.txt -r requirements/tests/pydantic-${{ inputs.pydantic-version }}.txt # Replace with Github Actions' services after https://github.com/chonjay21/docker-webdav/pull/3 # Cannot use services because we need to mount config file from the repo, but services start before checkout. diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b48f88c5a..4a337a751 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,7 +21,7 @@ jobs: uses: ./.github/workflows/get-matrix.yml tests-core: - name: Run core tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run core tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -31,12 +31,13 @@ jobs: uses: ./.github/workflows/test-core.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-clickhouse: - name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Clickhouse tests (server=${{ matrix.clickhouse-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -48,12 +49,13 @@ jobs: clickhouse-image: ${{ matrix.clickhouse-image }} clickhouse-version: ${{ matrix.clickhouse-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-greenplum: - name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ 
matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Greenplum tests (server=${{ matrix.greenplum-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -64,6 +66,7 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} @@ -72,7 +75,7 @@ jobs: GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} tests-hive: - name: Run Hive tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Hive tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -82,12 +85,13 @@ jobs: uses: ./.github/workflows/test-hive.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-kafka: - name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Kafka tests (server=${{ matrix.kafka-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -98,12 +102,13 @@ jobs: with: kafka-version: ${{ matrix.kafka-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-local-fs: - name: Run LocalFS tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run LocalFS tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -113,12 +118,13 @@ jobs: uses: ./.github/workflows/test-local-fs.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mongodb: - name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MongoDB tests (server=${{ matrix.mongodb-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -129,12 +135,13 @@ jobs: with: mongodb-version: ${{ matrix.mongodb-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} 
java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mssql: - name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MSSQL tests (server=${{ matrix.mssql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -145,12 +152,13 @@ jobs: with: mssql-version: ${{ matrix.mssql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-mysql: - name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run MySQL tests (server=${{ matrix.mysql-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -161,12 +169,13 @@ jobs: with: mysql-version: ${{ matrix.mysql-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-oracle: - name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Oracle tests (server=${{ matrix.oracle-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -179,12 +188,13 @@ jobs: oracle-version: ${{ matrix.oracle-version }} db-name: ${{ matrix.db-name }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-postgres: - name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Postgres tests (server=${{ matrix.postgres-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -195,12 +205,13 @@ jobs: with: postgres-version: ${{ matrix.postgres-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-teradata: - name: Run Teradata tests (spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Teradata tests (spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: 
[get-matrix] strategy: fail-fast: false @@ -210,12 +221,13 @@ jobs: uses: ./.github/workflows/test-teradata.yml with: spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-ftp: - name: Run FTP tests (server=${{ matrix.ftp-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTP tests (server=${{ matrix.ftp-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -225,11 +237,12 @@ jobs: uses: ./.github/workflows/test-ftp.yml with: ftp-version: ${{ matrix.ftp-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-ftps: - name: Run FTPS tests (server=${{ matrix.ftps-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run FTPS tests (server=${{ matrix.ftps-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -239,11 +252,12 @@ jobs: uses: ./.github/workflows/test-ftps.yml with: ftps-version: ${{ matrix.ftps-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-hdfs: - name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run HDFS tests (server=${{ matrix.hadoop-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -254,12 +268,13 @@ jobs: with: hadoop-version: ${{ matrix.hadoop-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-s3: - name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run S3 tests (server=${{ matrix.minio-version }}, spark=${{ matrix.spark-version }}, pydantic=${{ matrix.pydantic-version }}, java=${{ matrix.java-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -270,12 +285,13 @@ jobs: with: minio-version: ${{ matrix.minio-version }} spark-version: ${{ matrix.spark-version }} + pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-sftp: - name: Run SFTP tests (server=${{ matrix.openssh-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run SFTP tests (server=${{ matrix.openssh-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -285,11 +301,12 @@ jobs: uses: ./.github/workflows/test-sftp.yml with: openssh-version: ${{ matrix.openssh-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-samba: - name: Run Samba 
tests (server=${{ matrix.server-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run Samba tests (server=${{ matrix.server-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -299,12 +316,13 @@ jobs: uses: ./.github/workflows/test-samba.yml with: server-version: ${{ matrix.server-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} tests-webdav: - name: Run WebDAV tests (server=${{ matrix.webdav-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) + name: Run WebDAV tests (server=${{ matrix.webdav-version }}, pydantic=${{ matrix.pydantic-version }}, python=${{ matrix.python-version }}, os=${{ matrix.os }}) needs: [get-matrix] strategy: fail-fast: false @@ -314,6 +332,7 @@ jobs: uses: ./.github/workflows/test-webdav.yml with: webdav-version: ${{ matrix.webdav-version }} + pydantic-version: ${{ matrix.pydantic-version }} python-version: ${{ matrix.python-version }} os: ${{ matrix.os }} diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 847cb5765..e347a6ecf 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -70,6 +70,7 @@ Create virtualenv and install dependencies: -r requirements/tests/mysql.txt \ -r requirements/tests/postgres.txt \ -r requirements/tests/oracle.txt \ + -r requirements/tests/pydantic-2.txt \ -r requirements/tests/spark-3.5.0.txt # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 diff --git a/docker/Dockerfile b/docker/Dockerfile index 0faa2591e..36cbb129f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -68,7 +68,8 @@ RUN pip install \ -r /app/requirements/tests/mysql.txt \ -r /app/requirements/tests/oracle.txt \ -r /app/requirements/tests/postgres.txt \ - -r /app/requirements/tests/kafka.txt + -r /app/requirements/tests/kafka.txt \ + -r /app/requirements/tests/pydantic-${PYDANTIC_VERSION}.txt FROM base AS full diff --git a/docs/changelog/next_release/230.feature.rst b/docs/changelog/next_release/230.feature.rst new file mode 100644 index 000000000..9220b131f --- /dev/null +++ b/docs/changelog/next_release/230.feature.rst @@ -0,0 +1 @@ +Add support of Pydantic v2. 
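Every source-file change below applies the same compatibility fallback for pydantic imports. As a standalone sketch of the idea — the try/except fallback is copied from the diffs that follow, while the ``Credentials`` model and its fields are hypothetical, added here only to illustrate that the legacy v1 API behaves the same under pydantic 1.x and 2.x:

.. code:: python

    # Pydantic 2.x re-exports the legacy API under `pydantic.v1`,
    # while pydantic 1.x exposes the same names at the top level.
    try:
        from pydantic.v1 import BaseModel, Field, SecretStr, validator
    except (ImportError, AttributeError):
        from pydantic import BaseModel, Field, SecretStr, validator  # type: ignore[no-redef, assignment]


    class Credentials(BaseModel):
        # Hypothetical model, not part of the patch.
        user: str = Field(min_length=1)
        password: SecretStr

        @validator("user")
        def _strip_user(cls, value: str) -> str:
            return value.strip()


    creds = Credentials(user=" onetl ", password="secret")
    assert creds.user == "onetl"                          # validator ran
    assert creds.password.get_secret_value() == "secret"  # str coerced to SecretStr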
diff --git a/onetl/_internal.py b/onetl/_internal.py index 3b533fac0..298ecdf95 100644 --- a/onetl/_internal.py +++ b/onetl/_internal.py @@ -11,7 +11,11 @@ from typing import TYPE_CHECKING, Any from etl_entities.process import ProcessStackManager -from pydantic import SecretStr + +try: + from pydantic.v1 import SecretStr +except (ImportError, AttributeError): + from pydantic import SecretStr # type: ignore[no-redef, assignment] if TYPE_CHECKING: from pathlib import PurePath diff --git a/onetl/connection/db_connection/db_connection/connection.py b/onetl/connection/db_connection/db_connection/connection.py index ff3e0af7f..1372cd69a 100644 --- a/onetl/connection/db_connection/db_connection/connection.py +++ b/onetl/connection/db_connection/db_connection/connection.py @@ -5,7 +5,10 @@ from logging import getLogger from typing import TYPE_CHECKING -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import BaseDBConnection diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index d1c378cd7..bdd509862 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Any, ClassVar from etl_entities.instance import Host -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl._util.classproperty import classproperty from onetl._util.java import try_import_java_class diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index 8779722fa..e100e35f1 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -6,7 +6,10 @@ from enum import Enum from typing import Optional -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.connection.db_connection.jdbc_mixin import JDBCOptions diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 328d6b98e..f465fd73a 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -7,7 +7,11 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable from etl_entities.instance import Cluster -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement from onetl._util.spark import inject_spark_param diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 54acccc64..cdf5b5d5c 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -6,7 +6,11 @@ from enum import Enum from typing import List, Optional, Tuple, Union -from pydantic import Field, root_validator, validator +try: + from pydantic.v1 import Field, root_validator, validator 
+except (ImportError, AttributeError): + from pydantic import Field, root_validator, validator # type: ignore[no-redef, assignment] + from typing_extensions import deprecated from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index 5a43c585f..fbeb3cead 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -6,7 +6,11 @@ from enum import Enum from typing import Optional -from pydantic import Field, PositiveInt, root_validator +try: + from pydantic.v1 import Field, PositiveInt, root_validator +except (ImportError, AttributeError): + from pydantic import Field, PositiveInt, root_validator # type: ignore[no-redef, assignment] + from typing_extensions import deprecated from onetl._internal import to_camel diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index e22ad6f2d..ab49aae81 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -9,7 +9,10 @@ from enum import Enum, auto from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar -from pydantic import Field, PrivateAttr, SecretStr, validator +try: + from pydantic.v1 import Field, PrivateAttr, SecretStr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, SecretStr, validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement, stringify from onetl._util.java import get_java_gateway, try_import_java_class diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 583cb4f53..349630719 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -4,7 +4,10 @@ from typing import Optional -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index b7f06194c..21a042451 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -8,7 +8,11 @@ from typing import TYPE_CHECKING, Any, List, Optional from etl_entities.instance import Cluster -from pydantic import root_validator, validator + +try: + from pydantic.v1 import root_validator, validator +except (ImportError, AttributeError): + from pydantic import root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.java import try_import_java_class diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index 56037cb32..e364b4123 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING -from pydantic import Field, SecretStr +try: + from pydantic.v1 import Field, SecretStr +except (ImportError, AttributeError): + from pydantic import Field, SecretStr # type: ignore[no-redef, assignment] from onetl.connection.db_connection.kafka.kafka_auth import KafkaAuth 
from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 2af64f165..6f56ca55d 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -7,7 +7,10 @@ import shutil from typing import TYPE_CHECKING, Optional -from pydantic import Field, PrivateAttr, root_validator, validator +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.file import get_file_hash, is_file_readable diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 579dc7ed0..2015123ee 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -4,7 +4,11 @@ from typing import TYPE_CHECKING -from pydantic import Field, SecretStr +try: + from pydantic.v1 import Field, SecretStr +except (ImportError, AttributeError): + from pydantic import Field, SecretStr # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl._internal import stringify diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index d76c4e333..f7b9ecc15 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -5,7 +5,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Optional -from pydantic import Field, SecretStr, validator +try: + from pydantic.v1 import Field, SecretStr, validator +except (ImportError, AttributeError): + from pydantic import Field, SecretStr, validator # type: ignore[no-redef, assignment] from onetl._internal import stringify from onetl._util.file import is_file_readable diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index 7ca76c1fc..0b04f2cb5 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -4,7 +4,10 @@ from enum import Enum -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index c84317d88..b34b6810a 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -9,7 +9,11 @@ from urllib import parse as parser from etl_entities.instance import Host -from pydantic import SecretStr, validator + +try: + from pydantic.v1 import SecretStr, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, validator # type: ignore[no-redef, assignment] from onetl._util.classproperty import classproperty from onetl._util.java import try_import_java_class diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 4b4656fa5..84022b8a5 100644 --- 
a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -5,7 +5,10 @@ import warnings from enum import Enum -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 6d87cb559..7008d09b5 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -12,7 +12,10 @@ from textwrap import indent from typing import TYPE_CHECKING, Any, ClassVar, Optional -from pydantic import root_validator +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] from onetl._internal import clear_statement from onetl._util.classproperty import classproperty diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 5e91648e1..6fcbaa856 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -9,7 +9,11 @@ from typing import Optional from etl_entities.instance import Host -from pydantic import SecretStr + +try: + from pydantic.v1 import SecretStr +except (ImportError, AttributeError): + from pydantic import SecretStr # type: ignore[no-redef, assignment] from onetl.base import PathStatProtocol from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 9cf4482d7..7105763fe 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -9,7 +9,11 @@ from typing import TYPE_CHECKING, Optional, Tuple from etl_entities.instance import Cluster, Host -from pydantic import Field, FilePath, SecretStr, root_validator, validator + +try: + from pydantic.v1 import Field, FilePath, SecretStr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, FilePath, SecretStr, root_validator, validator # type: ignore[no-redef, assignment] from onetl.base import PathStatProtocol from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 9deb016c2..7811c4c4b 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -29,7 +29,12 @@ ) from e from etl_entities.instance import Host -from pydantic import SecretStr, root_validator + +try: + from pydantic.v1 import SecretStr, root_validator +except (ImportError, AttributeError): + from pydantic import SecretStr, root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 81c6cce8a..9fc0857fa 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -11,9 +11,13 @@ from typing import Optional, Union from etl_entities.instance import Host -from pydantic import SecretStr, validator from typing_extensions import Literal +try: + 
from pydantic.v1 import SecretStr, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, validator # type: ignore[no-redef, assignment] + from onetl.connection.file_connection.file_connection import FileConnection from onetl.hooks import slot, support_hooks from onetl.impl import LocalPath, RemotePath, RemotePathStat diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 61ed3ee95..3d64669b1 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -10,7 +10,11 @@ from typing import Optional from etl_entities.instance import Host -from pydantic import FilePath, SecretStr + +try: + from pydantic.v1 import FilePath, SecretStr +except (ImportError, AttributeError): + from pydantic import FilePath, SecretStr # type: ignore[no-redef, assignment] from onetl.connection.file_connection.file_connection import FileConnection from onetl.connection.file_connection.mixins.rename_dir_mixin import RenameDirMixin diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index b0eb7decc..f74eed3e3 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -12,7 +12,12 @@ from typing import Optional, Union from etl_entities.instance import Host -from pydantic import DirectoryPath, FilePath, SecretStr, root_validator + +try: + from pydantic.v1 import DirectoryPath, FilePath, SecretStr, root_validator +except (ImportError, AttributeError): + from pydantic import DirectoryPath, FilePath, SecretStr, root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl.connection.file_connection.file_connection import FileConnection diff --git a/onetl/connection/file_df_connection/spark_file_df_connection.py b/onetl/connection/file_df_connection/spark_file_df_connection.py index d3061385d..06121139f 100644 --- a/onetl/connection/file_df_connection/spark_file_df_connection.py +++ b/onetl/connection/file_df_connection/spark_file_df_connection.py @@ -7,7 +7,10 @@ from logging import getLogger from typing import TYPE_CHECKING -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.hadoop import get_hadoop_config from onetl._util.spark import try_import_pyspark diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 7f23e8b58..1c0ca6bb3 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -10,7 +10,11 @@ from typing import TYPE_CHECKING, Optional from etl_entities.instance import Cluster, Host -from pydantic import Field, PrivateAttr, validator + +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.connection.file_df_connection.spark_file_df_connection import ( diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index d2accb68a..0ebd43683 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -6,7 +6,10 @@ 
import socket from pathlib import Path -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.connection.file_df_connection.spark_file_df_connection import ( diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index ff516abf7..46ce8bb33 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -8,7 +8,12 @@ from typing import TYPE_CHECKING, ClassVar, List, Optional from etl_entities.instance import Host -from pydantic import SecretStr, root_validator, validator + +try: + from pydantic.v1 import SecretStr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import SecretStr, root_validator, validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal from onetl._internal import stringify diff --git a/onetl/core/file_filter/file_filter.py b/onetl/core/file_filter/file_filter.py index 5f71ff25a..a8fd7b694 100644 --- a/onetl/core/file_filter/file_filter.py +++ b/onetl/core/file_filter/file_filter.py @@ -9,9 +9,13 @@ import warnings from typing import List, Optional, Union -from pydantic import Field, root_validator, validator from typing_extensions import deprecated +try: + from pydantic.v1 import Field, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator, validator # type: ignore[no-redef, assignment] + from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel, RemotePath diff --git a/onetl/core/file_limit/file_limit.py b/onetl/core/file_limit/file_limit.py index 0e5637f8b..d3b98718b 100644 --- a/onetl/core/file_limit/file_limit.py +++ b/onetl/core/file_limit/file_limit.py @@ -5,9 +5,13 @@ import textwrap import warnings -from pydantic import validator from typing_extensions import deprecated +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] + from onetl.base import BaseFileLimit, PathProtocol from onetl.impl import FrozenModel diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 2ce47a54c..83953b924 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -11,7 +11,11 @@ from etl_entities.hwm import HWM, ColumnHWM, KeyValueHWM from etl_entities.old_hwm import IntHWM as OldColumnHWM from etl_entities.source import Column, Table -from pydantic import Field, PrivateAttr, root_validator, validator + +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import ( diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index e79bb5205..eb83c9eb8 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -5,7 +5,10 @@ from logging import getLogger from typing import TYPE_CHECKING, Optional -from pydantic import Field, PrivateAttr, validator +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, 
validator # type: ignore[no-redef, assignment] from onetl.base import BaseDBConnection from onetl.hooks import slot, support_hooks diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 5b8d9bed6..6967387a6 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -7,7 +7,11 @@ from typing import TYPE_CHECKING, Iterable, Optional from ordered_set import OrderedSet -from pydantic import PrivateAttr, validator + +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl._util.spark import try_import_pyspark from onetl.base import BaseFileDFConnection, BaseReadableFileFormat, PurePathProtocol diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index ea4564549..7fdbe64d9 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, Optional -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.spark import get_pyspark_version from onetl.base import FileDFReadOptions diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index 3b7a2724c..a80f54801 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -5,7 +5,10 @@ import logging from typing import TYPE_CHECKING -from pydantic import PrivateAttr, validator +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileDFConnection, BaseWritableFileFormat, PurePathProtocol from onetl.file.file_df_writer.options import FileDFWriterOptions diff --git a/onetl/file/file_df_writer/options.py b/onetl/file/file_df_writer/options.py index 01d808135..01bd9ee48 100644 --- a/onetl/file/file_df_writer/options.py +++ b/onetl/file/file_df_writer/options.py @@ -7,7 +7,10 @@ from enum import Enum from typing import TYPE_CHECKING, ContextManager, Iterable, List, Optional, Union -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl._util.spark import inject_spark_param from onetl.base import FileDFWriteOptions diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index a3db0ef19..f179491e0 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -16,7 +16,11 @@ from etl_entities.old_hwm import FileListHWM as OldFileListHWM from etl_entities.source import RemoteFolder from ordered_set import OrderedSet -from pydantic import Field, PrivateAttr, root_validator, validator + +try: + from pydantic.v1 import Field, PrivateAttr, root_validator, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] from onetl._internal import generate_temp_path from onetl.base import BaseFileConnection, BaseFileFilter, BaseFileLimit diff --git 
a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index 11b776239..07cc2abc3 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 7d6233025..754dcdc04 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedRemoteFile, LocalPath, RemoteFile, RemotePath diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index c39f2170f..1c0b55dc7 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -9,7 +9,11 @@ from typing import Iterable, List, Optional, Tuple from ordered_set import OrderedSet -from pydantic import Field, PrivateAttr, validator + +try: + from pydantic.v1 import Field, PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileConnection, BaseFileFilter, BaseFileLimit from onetl.base.path_protocol import PathProtocol, PathWithStatsProtocol diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index 78a6fd63d..b68326198 100644 --- a/onetl/file/file_mover/options.py +++ b/onetl/file/file_mover/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 59fc433e8..90175a2d4 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedRemoteFile, RemoteFile, RemotePath diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index 0ba074288..2841741f9 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -6,7 +6,11 @@ from typing import Iterable from humanize import naturalsize -from pydantic import Field, validator + +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl.base import PurePathProtocol from onetl.exception import ( diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index d4e77ecdd..ba107310b 100644 --- 
a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -9,7 +9,11 @@ from typing import Iterable, Optional, Tuple from ordered_set import OrderedSet -from pydantic import PrivateAttr, validator + +try: + from pydantic.v1 import PrivateAttr, validator +except (ImportError, AttributeError): + from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] from onetl._internal import generate_temp_path from onetl.base import BaseFileConnection diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index af4e588fe..9789d8477 100644 --- a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -4,7 +4,10 @@ import warnings -from pydantic import Field, root_validator +try: + from pydantic.v1 import Field, root_validator +except (ImportError, AttributeError): + from pydantic import Field, root_validator # type: ignore[no-redef, assignment] from onetl.impl import FileExistBehavior, GenericOptions diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 48fbbcc85..9a785c719 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -2,7 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.file_result import FileResult, FileSet from onetl.impl import FailedLocalFile, LocalPath, RemoteFile diff --git a/onetl/file/filter/exclude_dir.py b/onetl/file/filter/exclude_dir.py index 33b387623..a7d9007a5 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -4,7 +4,10 @@ import os -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol, PurePathProtocol from onetl.impl import FrozenModel, RemotePath diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index 045ee1bcc..768261e60 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -4,7 +4,10 @@ import glob -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index e07ba7039..0c64001da 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -5,7 +5,10 @@ import os import re -from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.base import BaseFileFilter, PathProtocol from onetl.impl import FrozenModel diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 04d07af84..551c76c9b 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -6,7 +6,10 @@ import logging from typing import TYPE_CHECKING, ClassVar, Dict, Optional -from pydantic import Field, validator +try: + from pydantic.v1 import Field, validator +except (ImportError, AttributeError): + from pydantic import Field, validator # type: ignore[no-redef, assignment] from onetl._util.java import 
try_import_java_class from onetl._util.scala import get_default_scala_version diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index e21eb5d82..41794f7ef 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -4,7 +4,10 @@ from typing import TYPE_CHECKING, ClassVar -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 794593d34..47190ea77 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -5,7 +5,10 @@ import logging from typing import TYPE_CHECKING, ClassVar -from pydantic import Field +try: + from pydantic.v1 import Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version diff --git a/onetl/hwm/auto_hwm.py b/onetl/hwm/auto_hwm.py index 26ee406c8..5c42f3579 100644 --- a/onetl/hwm/auto_hwm.py +++ b/onetl/hwm/auto_hwm.py @@ -5,7 +5,12 @@ from typing import Any from etl_entities.hwm import HWM -from pydantic import root_validator + +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] + from typing_extensions import Literal diff --git a/onetl/hwm/store/yaml_hwm_store.py b/onetl/hwm/store/yaml_hwm_store.py index 759637bab..2ffd24f68 100644 --- a/onetl/hwm/store/yaml_hwm_store.py +++ b/onetl/hwm/store/yaml_hwm_store.py @@ -14,7 +14,11 @@ register_hwm_store_class, ) from platformdirs import user_data_dir -from pydantic import validator + +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.hooks import slot, support_hooks from onetl.impl import FrozenModel, LocalPath diff --git a/onetl/impl/base_model.py b/onetl/impl/base_model.py index cf76cd48c..3208619e5 100644 --- a/onetl/impl/base_model.py +++ b/onetl/impl/base_model.py @@ -1,10 +1,15 @@ # SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) # SPDX-License-Identifier: Apache-2.0 +# isort: skip_file + from __future__ import annotations from typing import ClassVar -from pydantic import BaseModel as PydanticBaseModel +try: + from pydantic.v1 import BaseModel as PydanticBaseModel +except (ImportError, AttributeError): + from pydantic import BaseModel as PydanticBaseModel # type: ignore[no-redef, assignment] class BaseModel(PydanticBaseModel): diff --git a/onetl/impl/generic_options.py b/onetl/impl/generic_options.py index 057e69a65..8d3e629b9 100644 --- a/onetl/impl/generic_options.py +++ b/onetl/impl/generic_options.py @@ -7,7 +7,10 @@ from fnmatch import fnmatch from typing import Iterable, TypeVar -from pydantic import root_validator +try: + from pydantic.v1 import root_validator +except (ImportError, AttributeError): + from pydantic import root_validator # type: ignore[no-redef, assignment] from onetl.impl.frozen_model import FrozenModel diff --git a/onetl/strategy/batch_hwm_strategy.py b/onetl/strategy/batch_hwm_strategy.py index dc02fb29a..1e5ec5b5b 100644 --- a/onetl/strategy/batch_hwm_strategy.py +++ b/onetl/strategy/batch_hwm_strategy.py @@ -6,7 +6,10 @@ from textwrap import dedent from typing import Any, ClassVar 
-from pydantic import validator +try: + from pydantic.v1 import validator +except (ImportError, AttributeError): + from pydantic import validator # type: ignore[no-redef, assignment] from onetl.hwm import Edge from onetl.strategy.hwm_strategy import HWMStrategy diff --git a/requirements/core.txt b/requirements/core.txt index 27c17ceaa..c8e245a4a 100644 --- a/requirements/core.txt +++ b/requirements/core.txt @@ -1,4 +1,4 @@ -etl-entities>=2.2,<2.3 +etl-entities>=2.2,<2.4 evacuator>=1.0,<1.1 frozendict humanize @@ -8,7 +8,7 @@ ordered-set platformdirs # https://github.com/pydantic/pydantic/issues/4092 # https://github.com/pydantic/pydantic/issues/4498 -pydantic>=1.9.2,!=1.10.2,<2 +pydantic>=1.9.2,!=1.10.2,<3 pyyaml # https://github.com/python/typing_extensions/pull/188 typing_extensions>=4.5.0,!=4.6.1; python_version == "3.7" diff --git a/requirements/docs-plantuml.txt b/requirements/docs-plantuml.txt deleted file mode 100644 index efaaff7ac..000000000 --- a/requirements/docs-plantuml.txt +++ /dev/null @@ -1,2 +0,0 @@ ---no-deps -sphinx-plantuml diff --git a/requirements/docs.txt b/requirements/docs.txt index fedf2562a..3776dbb09 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,4 +1,4 @@ -autodoc-pydantic<2.0.0 +autodoc-pydantic<2 furo importlib-resources<6 numpydoc diff --git a/requirements/tests/pydantic-1.txt b/requirements/tests/pydantic-1.txt new file mode 100644 index 000000000..fd72c6577 --- /dev/null +++ b/requirements/tests/pydantic-1.txt @@ -0,0 +1 @@ +pydantic<2 diff --git a/requirements/tests/pydantic-2.txt b/requirements/tests/pydantic-2.txt new file mode 100644 index 000000000..8b2c779e6 --- /dev/null +++ b/requirements/tests/pydantic-2.txt @@ -0,0 +1 @@ +pydantic>=2,<3 diff --git a/requirements/tests/pydantic-latest.txt b/requirements/tests/pydantic-latest.txt new file mode 100644 index 000000000..2d5452210 --- /dev/null +++ b/requirements/tests/pydantic-latest.txt @@ -0,0 +1 @@ +pydantic>=2 diff --git a/setup.cfg b/setup.cfg index f34539545..96b763b7c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ max-cognitive-score = 20 # Max amount of cognitive complexity per module max-cognitive-average = 25 max-imports = 25 -max-imported-names = 55 +max-imported-names = 60 # Max of expression usages in a module max-module-expressions = 15 # Max of expression usages in a function From c8492b10d7f0d3eb6ada4bfef879aca91cb6e64a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 6 Mar 2024 12:38:26 +0000 Subject: [PATCH 38/63] [DOP-13259] Update Clickhouse types documentation --- .../db_connection/clickhouse/types.rst | 62 ++++++++++--------- .../db_connection/postgres/types.rst | 3 +- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 01d6e6b53..d34b427a6 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -23,8 +23,8 @@ Writing to some existing Clickhuse table This is how Clickhouse connector performs this: * Get names of columns in DataFrame. [1]_ -* Perform ``SELECT column1, colum2, ... FROM table LIMIT 0`` query -* For each column in query result get column name and Clickhouse type. +* Perform ``SELECT * FROM table LIMIT 0`` query. 
+* Take only columns present in DataFrame (by name, case-insensitive). For each found column, get its Clickhouse type.
 * **Find corresponding** ``Clickhouse type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_
 * Find corresponding ``Spark type`` -> ``Clickhousetype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
 * If ``Clickhousetype (write)`` match ``Clickhouse type (read)``, no additional casts will be performed, DataFrame column will be written to Clickhouse as is.
@@ -61,40 +61,44 @@ This may lead to incidental precision loss, or sometimes data cannot be written

 So instead of relying on Spark to create tables:

-.. code:: python
+.. dropdown:: See example

-    writer = DBWriter(
-        connection=clickhouse,
-        table="default.target_tbl",
-        options=Clickhouse.WriteOptions(
-            if_exists="append",
-            # ENGINE is required by Clickhouse
-            createTableOptions="ENGINE = MergeTree() ORDER BY id",
-        ),
-    )
-    writer.run(df)
+    .. code:: python
+
+        writer = DBWriter(
+            connection=clickhouse,
+            table="default.target_tbl",
+            options=Clickhouse.WriteOptions(
+                if_exists="append",
+                # ENGINE is required by Clickhouse
+                createTableOptions="ENGINE = MergeTree() ORDER BY id",
+            ),
+        )
+        writer.run(df)

 Always prefer creating tables with specific types **BEFORE WRITING DATA**:

-.. code:: python
+.. dropdown:: See example

-    clickhouse.execute(
-        """
-        CREATE TABLE default.target_tbl AS (
-            id UInt8,
-            value DateTime64(6) -- specific type and precision
-        )
-        ENGINE = MergeTree()
-        ORDER BY id
-        """,
-    )
-    writer = DBWriter(
-        connection=clickhouse,
-        table="default.target_tbl",
-        options=Clickhouse.WriteOptions(if_exists="append"),
-    )
-    writer.run(df)
+    .. code:: python
+
+        clickhouse.execute(
+            """
+            CREATE TABLE default.target_tbl AS (
+                id UInt8,
+                value DateTime64(6) -- specific type and precision
+            )
+            ENGINE = MergeTree()
+            ORDER BY id
+            """,
+        )
+
+        writer = DBWriter(
+            connection=clickhouse,
+            table="default.target_tbl",
+            options=Clickhouse.WriteOptions(if_exists="append"),
+        )
+        writer.run(df)

 References
 ~~~~~~~~~~
diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst
index 210e3bd6c..b149b64bd 100644
--- a/docs/connection/db_connection/postgres/types.rst
+++ b/docs/connection/db_connection/postgres/types.rst
@@ -26,7 +26,8 @@ Writing to some existing Clickhuse table
 This is how Postgres connector performs this:

 * Get names of columns in DataFrame. [1]_
-* Perform ``SELECT column1, colum2, ... FROM table LIMIT 0`` query. For each column in query result get Postgres type.
+* Perform ``SELECT * FROM table LIMIT 0`` query.
+* Take only columns present in DataFrame (by name, case-insensitive). For each found column, get its Postgres type.
 * Find corresponding ``Spark type`` -> ``Postgres type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
 * If ``Postgres type (write)`` match ``Postgres type (read)``, no additional casts will be performed, DataFrame column will be written to Postgres as is.
 * If ``Postgres type (write)`` does not match ``Postgres type (read)``, DataFrame column will be casted to target column type **on Postgres side**.
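As extra context for the column-matching behaviour documented above — not part of the patch, just an illustrative sketch that assumes the ``spark`` session, the ``clickhouse`` connection and the ``default.target_tbl`` table (``id UInt8, value DateTime64(6)``) from the examples in the diff:

.. code:: python

    from datetime import datetime

    from onetl.connection import Clickhouse
    from onetl.db import DBWriter

    # Hypothetical DataFrame whose column names differ from the target table
    # columns ("id", "value") only by case.
    df = spark.createDataFrame(
        [(1, datetime.now())],
        schema="ID TINYINT, VALUE TIMESTAMP",
    )

    writer = DBWriter(
        connection=clickhouse,
        table="default.target_tbl",
        options=Clickhouse.WriteOptions(if_exists="append"),
    )
    # Per the updated description: the connector runs
    # `SELECT * FROM default.target_tbl LIMIT 0`, matches "ID"/"VALUE" to the
    # table's "id"/"value" columns case-insensitively, and derives the target
    # Clickhouse types from that result before writing.
    writer.run(df)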
From d19bb3edc42b6e1b7fbc882dfa7d2226480286bb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Mar 2024 06:38:21 +0000 Subject: [PATCH 39/63] Bump the github-actions group with 1 update Bumps the github-actions group with 1 update: [softprops/action-gh-release](https://github.com/softprops/action-gh-release). Updates `softprops/action-gh-release` from 1 to 2 - [Release notes](https://github.com/softprops/action-gh-release/releases) - [Changelog](https://github.com/softprops/action-gh-release/blob/master/CHANGELOG.md) - [Commits](https://github.com/softprops/action-gh-release/compare/v1...v2) --- updated-dependencies: - dependency-name: softprops/action-gh-release dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions ... Signed-off-by: dependabot[bot] --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90dfc638d..72aab7800 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -105,7 +105,7 @@ jobs: - name: Create Github release id: create_release - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: token: ${{ secrets.GITHUB_TOKEN }} draft: false From c4a62924f77ea07b720b96c138d63b6a8794684d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 11 Mar 2024 07:49:31 +0000 Subject: [PATCH 40/63] [DOP-13337] Fix coverage config --- tests/.coveragerc | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/.coveragerc b/tests/.coveragerc index 295122b8a..5d7fb16f9 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -20,3 +20,4 @@ exclude_lines = spark = SparkSession._instantiatedSession if log.isEnabledFor(logging.DEBUG): if sys.version_info + except .*ImportError From 6476cc14175a74595882292d7b8687b826a08a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 11 Mar 2024 07:53:07 +0000 Subject: [PATCH 41/63] [DOP-13337] Update Codecov action usage --- .github/workflows/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4a337a751..3ee7f3f80 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -397,8 +397,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} file: ./reports/coverage.xml fail_ci_if_error: true - # TODO: remove after fixing https://github.com/codecov/codecov-cli/issues/367 - plugin: 'gcov' + plugin: 'noop' - name: All done run: echo 1 From f306a979354113e2863278b81a6f973fc665b44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 7 Mar 2024 11:09:32 +0000 Subject: [PATCH 42/63] [DOP-13558] Document using Greenplum connector with IP resolution issues --- .../next_release/228.improvement.rst | 1 + .../db_connection/greenplum/prerequisites.rst | 198 ++++++++++++++---- 2 files changed, 154 insertions(+), 45 deletions(-) diff --git 
a/docs/changelog/next_release/228.improvement.rst b/docs/changelog/next_release/228.improvement.rst
index ef936eaaa..c5295422e 100644
--- a/docs/changelog/next_release/228.improvement.rst
+++ b/docs/changelog/next_release/228.improvement.rst
@@ -1,3 +1,4 @@
 Improve Greenplum documentation:
     * Add "Types" section describing mapping between Greenplum and Spark types
     * Add more examples of reading and writing data from Greenplum
+    * Add notes about issues with IP resolution and building ``gpfdist`` URL.
diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst
index 4796e9d12..a5509a003 100644
--- a/docs/connection/db_connection/greenplum/prerequisites.rst
+++ b/docs/connection/db_connection/greenplum/prerequisites.rst
@@ -53,8 +53,8 @@ Greenplum master only receives commands to start reading/writing process, and ma
 
 More details can be found in `official documentation `_.
 
-Number of parallel connections
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Set number of connections
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. warning::
 
@@ -83,32 +83,32 @@ Number of connections can be limited by 2 ways:
 
     .. code-tab:: py Spark with master=local
 
-        (
+        spark = (
             SparkSession.builder
-            # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10
-            .config("spark.master", "local[10]")
+            # Spark will start EXACTLY 5 executors with 1 core each, so max number of parallel jobs is 5
+            .config("spark.master", "local[5]")
             .config("spark.executor.cores", 1)
-        )
+        ).getOrCreate()
 
    .. code-tab:: py Spark with master=yarn or master=k8s, dynamic allocation
 
-        (
+        spark = (
             SparkSession.builder
             .config("spark.master", "yarn")
             # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10
             .config("spark.dynamicAllocation.maxExecutors", 10)
             .config("spark.executor.cores", 1)
-        )
+        ).getOrCreate()
 
    .. code-tab:: py Spark with master=yarn or master=k8s, static allocation
 
-        (
+        spark = (
             SparkSession.builder
             .config("spark.master", "yarn")
             # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10
             .config("spark.executor.instances", 10)
             .config("spark.executor.cores", 1)
-        )
+        ).getOrCreate()
 
 * By limiting connection pool size used by Spark (**only** for Spark with ``master=local``):
 
@@ -140,8 +140,8 @@ e.g. by updating ``pg_hba.conf`` file.
 
 More details can be found in `official documentation `_.
 
-Network ports
-~~~~~~~~~~~~~
+Set connection port
+~~~~~~~~~~~~~~~~~~~
 
 Spark with ``master=k8s``
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -159,7 +159,7 @@ To read data from Greenplum using Spark, following ports should be opened in fir
 
    .. code:: python
 
-        Greenplum(host="master.host", port=5432, ...)
+        greenplum = Greenplum(host="master.host", port=5432, ...)
 
 * Greenplum segments -> some port range (e.g. ``41000-42000``) **listened by Spark executors**.
 
    .. code:: python
 
-        Greenplum(
-            ...,
-            extra={
-                "server.port": "41000-42000",
-            },
-        )
+        greenplum = Greenplum(
+            ...,
+            extra={
+                "server.port": "41000-42000",
+            },
+        )
 
 Number of ports in this range is ``number of parallel running Spark sessions`` * ``number of parallel connections per session``.
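+
+For example, a rough estimate (the numbers below are purely illustrative, adjust them to your own workload):
+
+.. code-block:: python
+
+    # 3 Spark sessions running at the same time,
+    # each with 10 executors * 1 core = 10 parallel connections per session
+    parallel_spark_sessions = 3
+    connections_per_session = 10
+
+    ports_needed = parallel_spark_sessions * connections_per_session  # 30
+    # so a range like "41000-41030" is enough for this setup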
@@ -189,8 +189,140 @@ More details can be found in official documentation: * `format of server.port value `_ * `port troubleshooting `_ -Required grants -~~~~~~~~~~~~~~~ +Set connection host +~~~~~~~~~~~~~~~~~~~ + +Spark with ``master=k8s`` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please follow `the official documentation `_ + +Spark with ``master=local`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, Greenplum connector tries to resolve IP of current host, and then pass it as ``gpfdist`` URL to Greenplum segment. +This may fail in some cases. + +For example, IP can be resolved using ``/etc/hosts`` content like this: + +.. code:: text + + 127.0.0.1 localhost real-host-name + +.. code:: bash + + $ hostname -f + localhost + + $ hostname -i + 127.0.0.1 + +Reading/writing data to Greenplum will fail with following exception: + +.. code-block:: text + + org.postgresql.util.PSQLException: ERROR: connection with gpfdist failed for + "gpfdist://127.0.0.1:49152/local-1709739764667/exec/driver", + effective url: "http://127.0.0.1:49152/local-1709739764667/exec/driver": + error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=123456) + +There are 2 ways to fix that: + +* Explicitly pass your host IP address to connector, like this + + .. code-block:: python + + import os + + # pass here real host IP (accessible from GP segments) + os.environ["HOST_IP"] = "192.168.1.1" + + greenplum = Greenplum( + ..., + extra={ + # connector will read IP from this environment variable + "server.hostEnv": "env.HOST_IP", + }, + spark=spark, + ) + + More details can be found in `official documentation `_. + +* Update ``/etc/hosts`` file to include real host IP: + + .. code:: text + + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 driver-host-name + + So Greenplum connector will properly resolve host IP. + +Spark with ``master=yarn`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The same issue with resolving IP address can occur on Hadoop cluster node, but it's tricky to fix, because each node has a different IP. + +There are 3 ways to fix that: + +* Pass node hostname to ``gpfdist`` URL. So IP will be resolved on segment side: + + .. code-block:: python + + greenplum = Greenplum( + ..., + extra={ + "server.useHostname": "true", + }, + ) + + But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. + + More details can be found in `official documentation `_. + +* Set specific network interface to get IP address from: + + .. code-block:: python + + greenplum = Greenplum( + ..., + extra={ + "server.nic": "eth0", + }, + ) + + You can get list of network interfaces using this command. + + .. note:: + + This command should be executed on Hadoop cluster node, **not** Spark driver host! + + .. code-block:: bash + + $ ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 + valid_lft 83457sec preferred_lft 83457sec + + Note that in this case **each** Hadoop cluster node node should have network interface with name ``eth0``. + + More details can be found in `official documentation `_. + +* Update ``/etc/hosts`` on each Hadoop cluster node to include real node IP: + + .. 
code:: text + + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 cluster-node-name + + So Greenplum connector will properly resolve node IP. + +Set required grants +~~~~~~~~~~~~~~~~~~~ Ask your Greenplum cluster administrator to set following grants for a user, used for creating a connection: @@ -243,28 +375,4 @@ used for creating a connection: -- allow read access to specific table GRANT SELECT ON schema_to_read.table_to_read TO username; - .. code-tab:: sql Write only - - -- get access to get tables metadata & cluster information - GRANT SELECT ON information_schema.tables TO username; - GRANT SELECT ON pg_attribute TO username; - GRANT SELECT ON pg_class TO username; - GRANT SELECT ON pg_namespace TO username; - GRANT SELECT ON pg_settings TO username; - GRANT SELECT ON pg_stats TO username; - GRANT SELECT ON gp_distributed_xacts TO username; - GRANT SELECT ON gp_segment_configuration TO username; - -- Greenplum 5.x only - GRANT SELECT ON gp_distribution_policy TO username; - - -- allow creating external tables in the same schema as target table - GRANT USAGE ON SCHEMA schema_to_write TO username; - GRANT CREATE ON SCHEMA schema_to_write TO username; - -- yes, ``readable``, because data is read from Spark executor to Greenplum. - ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist'); - - -- allow read access to specific table (to get column types) - -- allow write access to specific table - GRANT SELECT, INSERT ON schema_to_write.table_to_write TO username; - More details can be found in `official documentation `_. From b5a577a34316b5778b9ef34f5221d1d351c741d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 12 Mar 2024 14:00:28 +0000 Subject: [PATCH 43/63] [DOP-13252] Improve MySQL documentation --- .../next_release/234.improvement.rst | 5 + .../db_connection/mysql/execute.rst | 92 ++++- docs/connection/db_connection/mysql/index.rst | 8 + .../db_connection/mysql/prerequisites.rst | 64 +++ docs/connection/db_connection/mysql/read.rst | 74 +++- docs/connection/db_connection/mysql/sql.rst | 55 +++ docs/connection/db_connection/mysql/types.rst | 379 ++++++++++++++++++ docs/connection/db_connection/mysql/write.rst | 48 ++- .../db_connection/mysql/connection.py | 26 +- 9 files changed, 716 insertions(+), 35 deletions(-) create mode 100644 docs/changelog/next_release/234.improvement.rst create mode 100644 docs/connection/db_connection/mysql/prerequisites.rst create mode 100644 docs/connection/db_connection/mysql/sql.rst create mode 100644 docs/connection/db_connection/mysql/types.rst diff --git a/docs/changelog/next_release/234.improvement.rst b/docs/changelog/next_release/234.improvement.rst new file mode 100644 index 000000000..89857ef03 --- /dev/null +++ b/docs/changelog/next_release/234.improvement.rst @@ -0,0 +1,5 @@ +Improve MySQL documentation: + * Add "Types" section describing mapping between MySQL and Spark types + * Add "Prerequisites" section describing different aspects of connecting to MySQL + * Separate documentation of ``DBReader`` and ``MySQL.sql`` + * Add examples for ``MySQL.fetch`` and ``MySQL.execute`` diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index ec5d01482..243f4e704 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ 
b/docs/connection/db_connection/mysql/execute.rst @@ -1,7 +1,97 @@ .. _mysql-execute: Executing statements in MySQL -============================= +================================== + +How to +------ + +There are 2 ways to execute some statement in MySQL + +Use :obj:`MySQL.fetch ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +MySQL config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`MySQL.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MySQL, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{?= call func(arg1, arg2)}`` - special syntax for calling function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MySQL + + mysql = MySQL(...) + + df = mysql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MySQL.JDBCOptions(query_timeout=10), + ) + mysql.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`MySQL.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`MySQL.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MySQL, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MySQL + + mysql = MySQL(...) + + with mysql: + mysql.execute("DROP TABLE schema.table") + mysql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value float + ) + ENGINE = InnoDB + """, + options=MySQL.JDBCOptions(query_timeout=10), + ) + +References +---------- .. currentmodule:: onetl.connection.db_connection.mysql.connection diff --git a/docs/connection/db_connection/mysql/index.rst b/docs/connection/db_connection/mysql/index.rst index e221165cd..f3348141a 100644 --- a/docs/connection/db_connection/mysql/index.rst +++ b/docs/connection/db_connection/mysql/index.rst @@ -7,6 +7,7 @@ MySQL :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ MySQL :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst new file mode 100644 index 000000000..eeb59cf44 --- /dev/null +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -0,0 +1,64 @@ +.. 
_mysql-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* MySQL server versions: 5.7, 8.0 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use MySQL connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MySQL +----------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MySQL by using either DNS name of host or it's IP address. + +If you're using MySQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 3306. Port may differ for different MySQL instances. +Please ask your MySQL administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MySQL cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow external tables in the same schema as target table + GRANT CREATE ON myschema.* TO username@'192.168.1.%'; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username@'192.168.1.%'; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username@'192.168.1.%'; + +In example above ``'192.168.1.%''`` is a network subnet ``192.168.1.0 - 192.168.1.255`` +where Spark driver and executors are running. To allow connecting user from any IP, use ``'%'`` (not secure!). + +More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/mysql/read.rst b/docs/connection/db_connection/mysql/read.rst index 6a6960532..ebf7f7a49 100644 --- a/docs/connection/db_connection/mysql/read.rst +++ b/docs/connection/db_connection/mysql/read.rst @@ -1,18 +1,76 @@ .. _mysql-read: -Reading from MySQL -================== +Reading from MySQL using ``DBReader`` +===================================== -There are 2 ways of distributed data reading from MySQL: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`MySQL.sql ` + Please take into account :ref:`mysql-types` -Both methods accept :obj:`JDBCReadOptions ` +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like JOINs. -.. currentmodule:: onetl.connection.db_connection.mysql.connection +Supported DBReader features +--------------------------- -.. automethod:: MySQL.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ✅︎ ``hint`` (see `official documentation `_) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBReader + + mysql = MySQL(...) 
+ + reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + options=MySQL.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MySQL + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + mysql = MySQL(...) + + reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"), + options=MySQL.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Read options +------------ .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst new file mode 100644 index 000000000..515dcf77c --- /dev/null +++ b/docs/connection/db_connection/mysql/sql.rst @@ -0,0 +1,55 @@ +.. _clickhouse-sql: + +Reading from Clickhouse using ``Clickhouse.sql`` +================================================ + +.. warning:: + + Please take into account :ref:`clickhouse-types` + +:obj:`Clickhouse.sql ` allows passing custom SQL query, +but does not support incremental strategies. + +Method also accepts :obj:`JDBCReadOptions `. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ``SELECT ...`` +* ``WITH alias AS (...) SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + ) + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.clickhouse.connection + +.. automethod:: Clickhouse.sql diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst new file mode 100644 index 000000000..4968bd434 --- /dev/null +++ b/docs/connection/db_connection/mysql/types.rst @@ -0,0 +1,379 @@ +.. _mysql-types: + +MySQL <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from MySQL +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MySQL connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MySQL type. +* Find corresponding ``MySQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing MySQL table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MySQL connector performs this: + +* Get names of columns in DataFrame. 
[1]_
+* Perform ``SELECT * FROM table LIMIT 0`` query.
+* Take only columns present in DataFrame (by name, case insensitive). For each found column get MySQL type.
+* Find corresponding ``Spark type`` -> ``MySQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* If ``MySQL type (write)`` match ``MySQL type (read)``, no additional casts will be performed, DataFrame column will be written to MySQL as is.
+* If ``MySQL type (write)`` does not match ``MySQL type (read)``, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision.
+
+.. [1]
+    This allows to write data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column,
+    it will be populated by MySQL.
+
+Create new table using Spark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how MySQL connector performs this:
+
+* Find corresponding ``Spark type`` -> ``MySQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* Generate DDL for creating table in MySQL, like ``CREATE TABLE (col1 ...)``, and run it.
+* Write DataFrame to created table as is.
+
+But in some cases this may lead to using wrong column type. For example, Spark creates column of type ``timestamp``
+which corresponds to MySQL's type ``timestamp(0)`` (precision up to seconds)
+instead of more precise ``timestamp(6)`` (precision up to microseconds).
+This may lead to incidental precision loss, or sometimes data cannot be written to created table at all.
+
+So instead of relying on Spark to create tables:
+
+.. dropdown:: See example
+
+    .. code:: python
+
+        writer = DBWriter(
+            connection=mysql,
+            table="myschema.target_tbl",
+            options=MySQL.WriteOptions(
+                if_exists="append",
+                createTableOptions="ENGINE = InnoDB",
+            ),
+        )
+        writer.run(df)
+
+Always prefer creating tables with specific types **BEFORE WRITING DATA**:
+
+.. dropdown:: See example
+
+    ..
code:: python + + mysql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value timestamp(6) -- specific type and precision + ) + ENGINE = InnoDB + """, + ) + + writer = DBWriter( + connection=mysql, + table="myschema.target_tbl", + options=MySQL.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MySQL -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> MySQL `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+===================================+===============================+===============================+ +| ``decimal`` | ``DecimalType(P=10, S=0)`` | ``decimal(P=10, S=0)`` | ``decimal(P=10, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38, S=0..30)`` | ``DecimalType(P=0..38, S=0..30)`` | ``decimal(P=0..38, S=0..30)`` | ``decimal(P=0..38, S=0..30)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=39..65, S=...)`` | unsupported [2]_ | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``float`` | ``DoubleType()`` | ``double`` | ``double`` | ++-------------------------------+ | | | +| ``double`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``tinyint`` | ``IntegerType()`` | ``int`` | ``int`` | ++-------------------------------+ | | | +| ``smallint`` | | | | ++-------------------------------+ | | | +| ``mediumint`` | | | | ++-------------------------------+ | | | +| ``int`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +.. [2] + + MySQL support decimal types with precision ``P`` up to 65. + + But Spark's ``DecimalType(P, S)`` supports maximum ``P=38``. It is impossible to read, write or operate with values of larger precision, + this leads to an exception. 
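+
+As a workaround, such a column can be read by casting it to a string on the MySQL side
+(a minimal sketch; the ``huge_decimal_column`` name is just a placeholder, see "Explicit type cast" below):
+
+.. code-block:: python
+
+    from onetl.connection import MySQL
+    from onetl.db import DBReader
+
+    mysql = MySQL(...)
+
+    reader = DBReader(
+        connection=mysql,
+        source="schema.table",
+        columns=[
+            "id",
+            # decimal(65, 0) does not fit into Spark's DecimalType,
+            # so read it as a string and handle it on the Spark side
+            "CAST(huge_decimal_column AS CHAR) huge_decimal_str",
+        ],
+    )
+    df = reader.run()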
+ +Temporal types +~~~~~~~~~~~~~~ + ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================================+======================================+===================================+===============================+ +| ``year`` | ``DateType()`` | ``date`` | ``date`` | ++-----------------------------------+ | | | +| ``date`` | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime``, seconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds | ++-----------------------------------+ | | | +| ``timestamp``, seconds | | | | ++-----------------------------------+ | | | +| ``datetime(N=0)``, seconds | | | | ++-----------------------------------+ | | | +| ``timestamp(N=0)``, seconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime(N=3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds, | ++-----------------------------------+ | | **precision loss** [4]_, | +| ``timestamp(N=6)``, milliseconds | | | | ++-----------------------------------+ | | | +| ``datetime(N=6)``, microseconds | | | | ++-----------------------------------+ | | | +| ``timestamp(N=6)``, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time``, seconds | ``TimestampType()``, microseconds, | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds | ++-----------------------------------+ with time format quirks [5]_ | | | +| ``time(N=0)``, seconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(N=3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds, | ++-----------------------------------+ with time format quirks [5]_ | | **precision loss** [4]_, | +| ``time(N=6)``, microseconds | | | | ++-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ + +.. 
warning:: + + Note that types in MySQL and Spark have different value ranges: + + +-----------------------------+-------------------------+-------------------------+ + | Type | Min value | Max value | + +=============================+=========================+=========================+ + | MySQL's ``year`` | ``1901`` | ``2155`` | + +-----------------------------+-------------------------+-------------------------+ + | MySQL's ``date`` | ``1000-01-01`` | ``9999-12-31`` | + +-----------------------------+-------------------------+-------------------------+ + | Spark's ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +-----------------------------+-------------------------+-------------------------+ + | MySQL's ``datetime`` | ``1000-01-01 00:00:00`` | ``9999-12-31 23:59:59`` | + +-----------------------------+-------------------------+-------------------------+ + | MySQL's ``timestamp`` | ``1970-01-01 00:00:01`` | ``9999-12-31 23:59:59`` | + +-----------------------------+-------------------------+-------------------------+ + | MySQL's ``time`` | ``-838:59:59`` | ``838:59:59`` | + +-----------------------------+-------------------------+-------------------------+ + | Spark's ``TimestampType()`` | ``0001-01-01 00:00:00`` | ``9999-12-31 23:59:59`` | + +-----------------------------+-------------------------+-------------------------+ + + So Spark can read all the values from MySQL, but not all of values in Spark DataFrame can be written to MySQL. + + See: + * `MySQL's year documentation `_ + * `MySQL's date, datetime & timestamp documentation `_ + * `MySQL's time documentation `_ + * `Spark's DateType documentation `_ + * `Spark's TimestampType documentation `_ + +.. [4] + MySQL dialect generates DDL with MySQL type ``timestamp`` which is alias for ``timestamp(0)`` with precision up to seconds (``23:59:59``). + Inserting data with microseconds precision (``23:59:59.999999``) will lead to **throwing away microseconds**. + +.. [5] + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from MySQL like ``23:59:59`` + it is actually read ``1970-01-01 23:59:59``, and vice versa. 
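+
+A small illustration (assuming a DataFrame that was already read from MySQL and contains such a ``time_column``)
+of getting the original time-of-day back on the Spark side:
+
+.. code-block:: python
+
+    from pyspark.sql.functions import date_format
+
+    # values arrive as 1970-01-01 HH:mm:ss, so keep only the time part as a string
+    df = df.withColumn("time_column_str", date_format("time_column", "HH:mm:ss"))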
+ +String types +~~~~~~~~~~~~~ + ++-------------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===============================+==================+====================+=====================+ +| ``char`` | ``StringType()`` | ``longtext`` | ``longtext`` | ++-------------------------------+ | | | +| ``char(N)`` | | | | ++-------------------------------+ | | | +| ``varchar(N)`` | | | | ++-------------------------------+ | | | +| ``mediumtext`` | | | | ++-------------------------------+ | | | +| ``text`` | | | | ++-------------------------------+ | | | +| ``longtext`` | | | | ++-------------------------------+ | | | +| ``json`` | | | | ++-------------------------------+ | | | +| ``enum("val1", "val2", ...)`` | | | | ++-------------------------------+ | | | +| ``set("val1", "val2", ...)`` | | | | ++-------------------------------+------------------+--------------------+---------------------+ + +Binary types +~~~~~~~~~~~~ + ++---------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===========================+==================+====================+=====================+ +| ``binary`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++---------------------------+ | | | +| ``binary(N)`` | | | | ++---------------------------+ | | | +| ``varbinary(N)`` | | | | ++---------------------------+ | | | +| ``mediumblob`` | | | | ++---------------------------+ | | | +| ``blob`` | | | | ++---------------------------+ | | | +| ``longblob`` | | | | ++---------------------------+------------------+--------------------+---------------------+ + +Geometry types +~~~~~~~~~~~~~~ + ++---------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===========================+==================+====================+=====================+ +| ``point`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++---------------------------+ | | | +| ``linestring`` | | | | ++---------------------------+ | | | +| ``polygon`` | | | | ++---------------------------+ | | | +| ``geometry`` | | | | ++---------------------------+ | | | +| ``multipoint`` | | | | ++---------------------------+ | | | +| ``multilinestring`` | | | | ++---------------------------+ | | | +| ``multipolygon`` | | | | ++---------------------------+ | | | +| ``geometrycollection`` | | | | ++---------------------------+------------------+--------------------+---------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on MySQL side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use `JSON_OBJECT `_ MySQL function +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: + +.. code-block:: python + + from pyspark.sql.types import IntegerType, StructField, StructType + + from onetl.connection import MySQL + from onetl.db import DBReader + + mysql = MySQL(...) 
+
+    reader = DBReader(
+        connection=mysql,
+        columns=[
+            "id",
+            "supported_column",
+            "CAST(unsupported_column AS TEXT) unsupported_column_str",
+            # or
+            "JSON_OBJECT('key', value_column) json_column",
+        ],
+    )
+    df = reader.run()
+
+    # Spark requires all columns to have some type, describe it
+    column_type = StructType([StructField("key", IntegerType())])
+
+    # cast column content to proper Spark type
+    from pyspark.sql.functions import from_json
+
+    df = df.select(
+        df.id,
+        df.supported_column,
+        # explicit cast
+        df.unsupported_column_str.cast("integer").alias("parsed_integer"),
+        # or explicit json parsing
+        from_json(df.json_column, column_type).alias("struct_column"),
+    )
+
+``DBWriter``
+~~~~~~~~~~~~
+
+Convert dataframe column to JSON using `to_json `_,
+and write it as ``text`` column in MySQL:
+
+.. code:: python
+
+    mysql.execute(
+        """
+        CREATE TABLE schema.target_tbl (
+            id bigint,
+            array_column_json json -- any string type, actually
+        )
+        ENGINE = InnoDB
+        """,
+    )
+
+    from pyspark.sql.functions import to_json
+
+    df = df.select(
+        df.id,
+        to_json(df.array_column).alias("array_column_json"),
+    )
+
+    writer.run(df)
+
+Then you can parse this column on MySQL side - for example, by creating a view:
+
+.. code:: sql
+
+    SELECT id, array_column_json->"$[0]" AS array_item FROM target_tbl
+
+Or by using `GENERATED column `_:
+
+.. code-block:: sql
+
+    CREATE TABLE schema.target_table (
+        id bigint,
+        supported_column timestamp,
+        array_column_json json, -- any string type, actually
+        array_item_0 json GENERATED ALWAYS AS (array_column_json->"$[0]") VIRTUAL
+    )
+
+``VIRTUAL`` column value is calculated on every table read.
+``STORED`` column value is calculated during insert, but this requires additional space.
diff --git a/docs/connection/db_connection/mysql/write.rst b/docs/connection/db_connection/mysql/write.rst
index 67f13cf1b..6974c142f 100644
--- a/docs/connection/db_connection/mysql/write.rst
+++ b/docs/connection/db_connection/mysql/write.rst
@@ -1,9 +1,51 @@
 .. _mysql-write:
 
-Writing to MySQL
-================
+Writing to MySQL using ``DBWriter``
+========================================
 
-For writing data to MySQL, use :obj:`DBWriter ` with options below.
+For writing data to MySQL, use :obj:`DBWriter `.
+
+.. warning::
+
+    Please take into account :ref:`mysql-types`
+
+.. warning::
+
+    It is always recommended to create table explicitly using :obj:`MySQL.execute `
+    instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different precision and types than expected,
+    causing precision loss or other issues.
+
+Examples
+--------
+
+.. code-block:: python
+
+    from onetl.connection import MySQL
+    from onetl.db import DBWriter
+
+    mysql = MySQL(...)
+
+    df = ...  # data is here
+
+    writer = DBWriter(
+        connection=mysql,
+        target="schema.table",
+        options=MySQL.WriteOptions(
+            if_exists="append",
+            # ENGINE is optional in MySQL, InnoDB is the default
+            createTableOptions="ENGINE = InnoDB",
+        ),
+    )
+
+    writer.run(df)
+
+
+Write options
+-------------
+
+Method above accepts :obj:`JDBCWriteOptions `
 
 .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options
diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py
index 224504962..ebf43f668 100644
--- a/onetl/connection/db_connection/mysql/connection.py
+++ b/onetl/connection/db_connection/mysql/connection.py
@@ -29,29 +29,9 @@ class MySQL(JDBCConnection):
 
     Based on Maven package ``com.mysql:mysql-connector-j:8.0.33``
     (`official MySQL JDBC driver `_).
 
-    ..
dropdown:: Version compatibility - - * MySQL server versions: 5.7, 8.0 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use MySQL connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mysql-prerequisites` Parameters ---------- @@ -78,7 +58,7 @@ class MySQL(JDBCConnection): extra : dict, default: ``None`` Specifies one or more extra parameters by which clients can connect to the instance. - For example: ``{"useSSL": "false"}`` + For example: ``{"useSSL": "false", "allowPublicKeyRetrieval": "true"}`` See `MySQL JDBC driver properties documentation `_ @@ -107,7 +87,7 @@ class MySQL(JDBCConnection): host="database.host.or.ip", user="user", password="*****", - extra={"useSSL": "false"}, + extra={"useSSL": "false", "allowPublicKeyRetrieval": "true"}, spark=spark, ) From 02ba65a2f651f61bd1fbb59898e5e11c42981770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 11 Mar 2024 13:52:12 +0000 Subject: [PATCH 44/63] [DOP-13252] Improve Oracle documentation --- .../next_release/211.improvement.rst | 2 + .../next_release/228.improvement.rst | 3 +- .../next_release/229.improvement.rst | 3 +- .../next_release/233.improvement.rst | 5 + .../db_connection/clickhouse/execute.rst | 22 +- .../clickhouse/prerequisites.rst | 11 +- .../db_connection/clickhouse/sql.rst | 2 +- .../db_connection/clickhouse/types.rst | 23 +- .../db_connection/greenplum/execute.rst | 27 +- .../db_connection/greenplum/types.rst | 15 +- .../db_connection/oracle/execute.rst | 93 ++++- .../connection/db_connection/oracle/index.rst | 8 + .../db_connection/oracle/prerequisites.rst | 112 +++++ docs/connection/db_connection/oracle/read.rst | 74 +++- docs/connection/db_connection/oracle/sql.rst | 55 +++ .../connection/db_connection/oracle/types.rst | 382 ++++++++++++++++++ .../connection/db_connection/oracle/write.rst | 44 +- .../db_connection/postgres/execute.rst | 32 +- .../db_connection/postgres/prerequisites.rst | 23 +- .../db_connection/postgres/read.rst | 2 +- .../connection/db_connection/postgres/sql.rst | 4 +- .../db_connection/postgres/types.rst | 26 +- .../db_connection/postgres/write.rst | 2 +- .../db_connection/oracle/connection.py | 57 ++- 24 files changed, 885 insertions(+), 142 deletions(-) create mode 100644 docs/changelog/next_release/233.improvement.rst create mode 100644 docs/connection/db_connection/oracle/prerequisites.rst create mode 100644 docs/connection/db_connection/oracle/sql.rst create mode 100644 docs/connection/db_connection/oracle/types.rst diff --git a/docs/changelog/next_release/211.improvement.rst b/docs/changelog/next_release/211.improvement.rst index 853a7cf94..5deec2051 100644 --- a/docs/changelog/next_release/211.improvement.rst +++ b/docs/changelog/next_release/211.improvement.rst @@ -1,3 +1,5 @@ Improve Clickhouse documentation: * Add "Types" section describing mapping between Clickhouse and Spark types * Add "Prerequisites" section describing different aspects of connecting to Clickhouse + * 
Separate documentation of ``DBReader`` and ``Clickhouse.sql`` + * Add examples for ``Clickhouse.fetch`` and ``Clickhouse.execute`` diff --git a/docs/changelog/next_release/228.improvement.rst b/docs/changelog/next_release/228.improvement.rst index c5295422e..8b5c4c284 100644 --- a/docs/changelog/next_release/228.improvement.rst +++ b/docs/changelog/next_release/228.improvement.rst @@ -1,4 +1,5 @@ Improve Greenplum documentation: * Add "Types" section describing mapping between Greenplum and Spark types * Add more examples of reading and writing data from Greenplum - * Add notes about issues with IP resolution and building ``gpfdist`` URL. + * Add examples for ``Greenplum.fetch`` and ``Greenplum.execute`` + * Add notes about issues with IP resolution and building ``gpfdist`` URL diff --git a/docs/changelog/next_release/229.improvement.rst b/docs/changelog/next_release/229.improvement.rst index 30eae3549..d0d2d6b57 100644 --- a/docs/changelog/next_release/229.improvement.rst +++ b/docs/changelog/next_release/229.improvement.rst @@ -1,4 +1,5 @@ Improve Postgres documentation: * Add "Types" section describing mapping between Postgres and Spark types * Add "Prerequisites" section describing different aspects of connecting to Postgres - * Separate documentation of DBReader and Postgres.sql + * Separate documentation of ``DBReader`` and ``Postgres.sql`` + * Add examples for ``Postgres.fetch`` and ``Postgres.execute`` diff --git a/docs/changelog/next_release/233.improvement.rst b/docs/changelog/next_release/233.improvement.rst new file mode 100644 index 000000000..5b1913eba --- /dev/null +++ b/docs/changelog/next_release/233.improvement.rst @@ -0,0 +1,5 @@ +Improve Oracle documentation: + * Add "Types" section describing mapping between Oracle and Spark types + * Add "Prerequisites" section describing different aspects of connecting to Oracle + * Separate documentation of ``DBReader`` and ``Oracle.sql`` + * Add examples for ``Oracle.fetch`` and ``Oracle.execute`` diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index 59d956a89..ef9ad2ecf 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -23,11 +23,11 @@ Syntax support This method supports **any** query syntax supported by Clickhouse, like: -* ``SELECT ... FROM ...`` -* ``WITH alias AS (...) SELECT ...`` -* ``SHOW ...`` - -It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` - call function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ @@ -59,12 +59,12 @@ Syntax support This method supports **any** query syntax supported by Clickhouse, like: -* ``CREATE TABLE ...`` -* ``ALTER ...`` -* ``INSERT INTO ... AS SELECT ...`` -* etc - -It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``. +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... 
AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index 35c3f04e6..654add047 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -46,7 +46,7 @@ used for creating a connection: .. code-tab:: sql Read + Write - -- allow external tables in the same schema as target table + -- allow creating tables in the target schema GRANT CREATE TABLE ON myschema.* TO username; -- allow read & write access to specific table @@ -57,13 +57,4 @@ used for creating a connection: -- allow read access to specific table GRANT SELECT ON myschema.mytable TO username; - .. code-tab:: sql Write only - - -- allow external tables in the same schema as target table - GRANT CREATE TABLE ON myschema.* TO username; - - -- allow read access to specific table (to get column types) - -- allow write access to specific table - GRANT SELECT, INSERT ON myschema.mytable TO username; - More details can be found in `official documentation `_. diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst index 5550cde7b..cd331ef33 100644 --- a/docs/connection/db_connection/clickhouse/sql.rst +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -18,7 +18,7 @@ Syntax support Only queries with the following syntax are supported: * ``SELECT ...`` -* ``WITH ... SELECT ...`` +* ``WITH alias AS (...) SELECT ...`` Queries like ``SHOW ...`` are not supported. diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index d34b427a6..e9bdd22e9 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -17,8 +17,8 @@ This is how Clickhouse connector performs this: * Find corresponding ``Clickhouse type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * Create DataFrame from query with specific column names and Spark types. -Writing to some existing Clickhuse table -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Writing to some existing Clickhouse table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is how Clickhouse connector performs this: @@ -113,6 +113,8 @@ Here you can find source code with type conversions: Supported types --------------- +See `official documentation `_ + Generic types ~~~~~~~~~~~~~ @@ -243,7 +245,7 @@ Note: ``DateTime(P, TZ)`` has the same precision as ``DateTime(P)``. .. [5] Generic JDBC dialect generates DDL with Clickhouse type ``TIMESTAMP`` which is alias for ``DateTime32`` with precision up to seconds (``23:59:59``). - Inserting data with milliseconds precision (``23:59:59.999``) will lead to throwing away milliseconds (``23:59:59``). + Inserting data with milliseconds precision (``23:59:59.999``) will lead to **throwing away milliseconds**. .. [6] Clickhouse will raise an exception that data in format ``2001-01-01 23:59:59.999999`` has data ``.999999`` which does not match format ``YYYY-MM-DD hh:mm:ss``. @@ -304,8 +306,11 @@ This dialect does not have type conversion between some types, like Clickhouse ` The is a way to avoid this - just cast everything to ``String``. 
-Read unsupported column type -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ Use ``CAST`` or ``toJSONString`` to get column data as string in JSON format, and then cast string column in resulting dataframe to proper type using `from_json `_: @@ -332,8 +337,8 @@ and then cast string column in resulting dataframe to proper type using `from_js from_json(df.array_column, column_type).alias("array_column"), ) -Write unsupported column type -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``DBWriter`` +~~~~~~~~~~~~ Convert dataframe column to JSON using `to_json `_, and write it as ``String`` column in Clickhouse: @@ -342,7 +347,7 @@ and write it as ``String`` column in Clickhouse: clickhouse.execute( """ - CREATE TABLE target_tbl AS ( + CREATE TABLE default.target_tbl AS ( id Int32, array_column_json String, ) @@ -360,7 +365,7 @@ and write it as ``String`` column in Clickhouse: writer.run(df) -Then you can parse this column on Clickhouse side: +Then you can parse this column on Clickhouse side - for example, by creating a view: .. code:: sql diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index 356df27b3..25fbe8962 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -28,12 +28,10 @@ Syntax support This method supports **any** query syntax supported by Greenplum, like: -* ``SELECT ... FROM ...`` -* ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ @@ -65,15 +63,14 @@ Syntax support This method supports **any** query syntax supported by Greenplum, like: -* ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on -* ``ALTER ...`` -* ``INSERT INTO ... AS SELECT ...`` -* ``DROP TABLE ...``, ``DROP VIEW ...``, and so on -* ``CALL procedure(arg1, arg2) ...`` -* ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions -* etc - -It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``. +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst index d6404569e..fb3077e9f 100644 --- a/docs/connection/db_connection/greenplum/types.rst +++ b/docs/connection/db_connection/greenplum/types.rst @@ -23,7 +23,7 @@ This is how Greenplum connector performs this: Yes, **all columns of a table**, not just selected ones. This means that if source table **contains** columns with unsupported type, the entire table cannot be read. 
-Writing to some existing Clickhuse table +Writing to some existing Greenplum table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is how Greenplum connector performs this: @@ -33,7 +33,7 @@ This is how Greenplum connector performs this: * For each column in query result get column name and Greenplum type. * Match table columns with DataFrame columns (by name, case insensitive). If some column is present only in target table, but not in DataFrame (like ``DEFAULT`` or ``SERIAL`` column), and vice versa, raise an exception. - See `Write unsupported column type`_. + See `Explicit type cast`_. * Find corresponding ``Spark type`` -> ``Greenplumtype (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. * If ``Greenplumtype (write)`` match ``Greenplum type (read)``, no additional casts will be performed, DataFrame column will be written to Greenplum as is. * If ``Greenplumtype (write)`` does not match ``Greenplum type (read)``, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to ``json`` column which Greenplum connector currently does not support. @@ -272,8 +272,11 @@ Columns of these types cannot be read/written by Spark: The is a way to avoid this - just cast unsupported types to ``text``. But the way this can be done is not a straightforward. -Read unsupported column type -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ Unfortunately, it is not possible to cast unsupported column to some supported type on ``DBReader`` side: @@ -334,8 +337,8 @@ You can then parse this column on Spark side using `from_json ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Oracle config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Oracle.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Oracle, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2) FROM DUAL`` - call function +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + + df = oracle.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Oracle.JDBCOptions(query_timeout=10), + ) + oracle.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`Oracle.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`Oracle.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Oracle, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...`` +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``DECLARE ... 
BEGIN ... END`` - execute PL/SQL statement +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + + with oracle: + oracle.execute("DROP TABLE schema.table") + oracle.execute( + """ + CREATE TABLE schema.table AS ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=Oracle.JDBCOptions(query_timeout=10), + ) + + +References +---------- .. currentmodule:: onetl.connection.db_connection.oracle.connection diff --git a/docs/connection/db_connection/oracle/index.rst b/docs/connection/db_connection/oracle/index.rst index 519250fb5..ec9005397 100644 --- a/docs/connection/db_connection/oracle/index.rst +++ b/docs/connection/db_connection/oracle/index.rst @@ -7,6 +7,7 @@ Oracle :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ Oracle :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst new file mode 100644 index 000000000..c86bc393c --- /dev/null +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -0,0 +1,112 @@ +.. _oracle-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Oracle Server versions: 23, 21, 19, 18, 12.2 and __probably__ 11.2 (tested, but it's not mentioned in official docs). +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Oracle connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Oracle +-------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 1521. Port may differ for different Oracle instances. +Please ask your Oracle administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Oracle by using either DNS name of host or it's IP address. + +If you're using Oracle cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Connect as proxy user +~~~~~~~~~~~~~~~~~~~~~ + +It is possible to connect to database as another user without knowing this user password. + +This can be enabled by granting user a special ``CONNECT THROUGH`` permission: + +.. code-block:: sql + + ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; + +Then you can connect to Oracle using credentials of ``proxy_user`` but specify that you need permissions of ``schema_owner``: + +.. code-block:: python + + oracle = Oracle( + ..., + user="proxy_user[schema_owner]", + password="proxy_user password", + ) + +See `official documentation `_. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Oracle cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. 
code-tab:: sql Read + Write (schema is owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in user schema + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + .. code-tab:: sql Read + Write (schema is not owned by user) + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT CREATE ANY TABLE TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT DROP ANY TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON someschema.mytable TO username; + + .. code-tab:: sql Read only + + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; + +More details can be found in official documentation: + * `GRANT `_ + * `SELECT `_ + * `CREATE TABLE `_ + * `INSERT `_ + * `TRUNCATE TABLE `_ diff --git a/docs/connection/db_connection/oracle/read.rst b/docs/connection/db_connection/oracle/read.rst index ffd393e6e..5877e2caf 100644 --- a/docs/connection/db_connection/oracle/read.rst +++ b/docs/connection/db_connection/oracle/read.rst @@ -1,18 +1,76 @@ .. _oracle-read: -Reading from Oracle -=================== +Reading from Oracle using ``DBReader`` +====================================== -There are 2 ways of distributed data reading from Oracle: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Oracle.sql ` + Please take into account :ref:`oracle-types` -Both methods accept :obj:`JDBCReadOptions ` +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like JOINs. -.. currentmodule:: onetl.connection.db_connection.oracle.connection +Supported DBReader features +--------------------------- -.. automethod:: Oracle.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ✅︎ ``hint`` (see `official documentation `_) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + oracle = Oracle(...) 
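+    # updated_dt below is used as the HWM column: after the first run, only rows
+    # with updated_dt greater than the previously saved maximum are read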
+ + reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), + options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Read options +------------ .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst new file mode 100644 index 000000000..3ea0832d1 --- /dev/null +++ b/docs/connection/db_connection/oracle/sql.rst @@ -0,0 +1,55 @@ +.. _oracle-sql: + +Reading from Oracle using ``Oracle.sql`` +======================================== + +.. warning:: + + Please take into account :ref:`oracle-types` + +:obj:`Oracle.sql ` allows passing custom SQL query, +but does not support incremental strategies. + +Method also accepts :obj:`JDBCReadOptions `. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ``SELECT ...`` +* ``WITH alias AS (...) SELECT ...`` + +Queries like ``SHOW ...`` are not supported. + +This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Oracle + + oracle = Oracle(...) + df = oracle.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR2(4000)) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + ) + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.oracle.connection + +.. automethod:: Oracle.sql diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst new file mode 100644 index 000000000..907afe1c9 --- /dev/null +++ b/docs/connection/db_connection/oracle/types.rst @@ -0,0 +1,382 @@ +.. _oracle-types: + +Oracle <-> Spark type mapping +============================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from Oracle +~~~~~~~~~~~~~~~~~~~ + +This is how Oracle connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and Oracle type. +* Find corresponding ``Oracle type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing Oracle table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how Oracle connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. +* **Find corresponding** ``Oracle type (read)`` -> ``Spark type`` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [2]_ +* Find corresponding ``Spark type`` -> ``Oracle type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
+* If ``Oracle type (write)`` matches ``Oracle type (read)``, no additional casts will be performed, DataFrame column will be written to Oracle as is.
+* If ``Oracle type (write)`` does not match ``Oracle type (read)``, DataFrame column will be cast to the target column type **on Oracle side**.
+  For example, you can write a column with text data to an ``int`` column, if the column contains valid integer values within the supported value range and precision.
+
+.. [1]
+    This allows writing data to tables with ``DEFAULT`` and ``GENERATED`` columns - if the DataFrame has no such column,
+    it will be populated by Oracle.
+
+.. [2]
+
+    Yes, this is weird.
+
+Create new table using Spark
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. warning::
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how Oracle connector performs this:
+
+* Find corresponding ``Spark type`` -> ``Oracle type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+* Generate DDL for creating table in Oracle, like ``CREATE TABLE (col1 ...)``, and run it.
+* Write DataFrame to created table as is.
+
+But Oracle connector supports only a limited number of types and almost no custom clauses (like ``PARTITION BY``, ``INDEX``, etc).
+So instead of relying on Spark to create tables:
+
+.. dropdown:: See example
+
+    .. code:: python
+
+        writer = DBWriter(
+            connection=oracle,
+            table="public.table",
+            options=Oracle.WriteOptions(if_exists="append"),
+        )
+        writer.run(df)
+
+Always prefer creating the table with the desired DDL **BEFORE WRITING DATA**:
+
+.. dropdown:: See example
+
+    .. code:: python
+
+        oracle.execute(
+            """
+            CREATE TABLE username.table (
+                id NUMBER,
+                business_dt TIMESTAMP(6),
+                value VARCHAR2(2000)
+            )
+            """,
+        )
+
+        writer = DBWriter(
+            connection=oracle,
+            table="public.table",
+            options=Oracle.WriteOptions(if_exists="append"),
+        )
+        writer.run(df)
+
+See Oracle `CREATE TABLE `_ documentation.
+
+Supported types
+---------------
+
+References
+~~~~~~~~~~
+
+See `List of Oracle types `_.
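+Rather than tracing the mapping tables below by hand, you can also check how a specific table is mapped on a live connection.
+A minimal sketch (the table name here is just an illustration):
+
+.. code-block:: python
+
+    from onetl.connection import Oracle
+
+    oracle = Oracle(...)
+
+    # fetch zero rows - only column names and their Spark types are of interest
+    df = oracle.fetch("SELECT * FROM schema.table WHERE ROWNUM < 1")
+    df.printSchema()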
+ +Here you can find source code with type conversions: + +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ + +Numeric types +~~~~~~~~~~~~~ + ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==================================+===================================+===============================+===========================+ +| ``NUMBER`` | ``DecimalType(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``NUMBER(P=0..38, S=0)`` | ``NUMBER(P=38, S=0)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``NUMBER(P=0..38, S=0..38)`` | ``NUMBER(P=38, S=0..38)`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``NUMBER(P=..., S=-127..-1)`` | unsupported [3]_ | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``FLOAT`` | ``DecimalType(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ++----------------------------------+ | | | +| ``FLOAT(N=1..126)`` | | | | ++----------------------------------+ | | | +| ``REAL`` | | | | ++----------------------------------+ | | | +| ``DOUBLE PRECISION`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``BINARY_FLOAT`` | ``FloatType()`` | ``NUMBER(P=19, S=4)`` | ``NUMBER(P=19, S=4)`` | ++----------------------------------+-----------------------------------+ | | +| ``BINARY_DOUBLE`` | ``DoubleType()`` | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``SMALLINT`` | ``DecimalType(P=38, S=0)`` | ``NUMBER(P=38, S=0)`` | ``NUMBER(P=38, S=0)`` | ++----------------------------------+ | | | +| ``INTEGER`` | | | | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ +| ``LONG`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++----------------------------------+-----------------------------------+-------------------------------+---------------------------+ + +.. [3] + + Oracle support decimal types with negative scale, like ``NUMBER(38, -10)``. Spark doesn't. 
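+One possible workaround for such columns, in the spirit of the `Explicit type cast`_ section below, is to normalize the scale on the Oracle side
+(assuming the actual values fit into 38 significant digits), so Spark only ever sees a supported ``NUMBER``.
+The table and column names here are purely illustrative:
+
+.. code-block:: python
+
+    from onetl.connection import Oracle
+    from onetl.db import DBReader
+
+    oracle = Oracle(...)
+
+    reader = DBReader(
+        connection=oracle,
+        source="schema.table",
+        columns=[
+            "id",
+            # NUMBER(38, -10) cannot be represented in Spark,
+            # so round it to a zero-scale NUMBER on the Oracle side
+            "CAST(negative_scale_column AS NUMBER(38, 0)) negative_scale_column",
+        ],
+    )
+    df = reader.run()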
+ +Temporal types +~~~~~~~~~~~~~~ + ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++============================================+====================================+=================================+=================================+ +| ``DATE``, days | ``TimestampType()``, microseconds | ``TIMESTAMP(6)``, microseconds | ``TIMESTAMP(6)``, microseconds | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP``, microseconds | ``TimestampType()``, microseconds | ``TIMESTAMP(6)``, microseconds | ``TIMESTAMP(6)``, microseconds | ++--------------------------------------------+ | | | +| ``TIMESTAMP(0)``, seconds | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(3)``, milliseconds | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(6)``, microseconds | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP(9)``, nanoseconds | ``TimestampType()``, microseconds, | ``TIMESTAMP(6)``, microseconds, | ``TIMESTAMP(6)``, microseconds, | +| | **precision loss** [4]_ | **precision loss** | **precision loss** | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +| ``TIMESTAMP WITH TIME ZONE`` | unsupported | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(N=0..9) WITH TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP WITH LOCAL TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``TIMESTAMP(N=0..9) WITH LOCAL TIME ZONE`` | | | | ++--------------------------------------------+ | | | +| ``INTERVAL YEAR TO MONTH`` | | | | ++--------------------------------------------+ | | | +| ``INTERVAL DAY TO SECOND`` | | | | ++--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ + +.. [4] + Oracle support timestamp up to nanoseconds precision (``23:59:59.999999999``), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). + Nanoseconds will be lost during read or write operations. 
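+If nanosecond precision really matters for downstream consumers, one option (an illustration, not part of the connector itself)
+is to read such columns as text on the Oracle side, keeping all fractional digits. Column names here are hypothetical:
+
+.. code-block:: python
+
+    from onetl.connection import Oracle
+    from onetl.db import DBReader
+
+    oracle = Oracle(...)
+
+    reader = DBReader(
+        connection=oracle,
+        source="schema.table",
+        columns=[
+            "id",
+            # keep all 9 fractional digits as a string instead of truncating to microseconds
+            "TO_CHAR(ts9_column, 'YYYY-MM-DD HH24:MI:SS.FF9') ts9_column_str",
+        ],
+    )
+    df = reader.run()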
+ +String types +~~~~~~~~~~~~ + ++-----------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=============================+==================+=====================+======================+ +| ``CHAR`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++-----------------------------+ | | | +| ``CHAR(N CHAR)`` | | | | ++-----------------------------+ | | | +| ``CHAR(N BYTE)`` | | | | ++-----------------------------+ | | | +| ``NCHAR`` | | | | ++-----------------------------+ | | | +| ``NCHAR(N)`` | | | | ++-----------------------------+ | | | +| ``VARCHAR(N)`` | | | | ++-----------------------------+ | | | +| ``LONG VARCHAR`` | | | | ++-----------------------------+ | | | +| ``VARCHAR2(N CHAR)`` | | | | ++-----------------------------+ | | | +| ``VARCHAR2(N BYTE)`` | | | | ++-----------------------------+ | | | +| ``NVARCHAR2(N)`` | | | | ++-----------------------------+ | | | +| ``CLOB`` | | | | ++-----------------------------+ | | | +| ``NCLOB`` | | | | ++-----------------------------+------------------+---------------------+----------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++==========================+==================+=====================+======================+ +| ``RAW(N)`` | ``BinaryType()`` | ``BLOB`` | ``BLOB`` | ++--------------------------+ | | | +| ``LONG RAW`` | | | | ++--------------------------+ | | | +| ``BLOB`` | | | | ++--------------------------+------------------+---------------------+----------------------+ +| ``BFILE`` | unsupported | | | ++--------------------------+------------------+---------------------+----------------------+ + +Struct types +~~~~~~~~~~~~ + ++-------------------------------------+------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++=====================================+==================+=====================+======================+ +| ``XMLType`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++-------------------------------------+ | | | +| ``URIType`` | | | | ++-------------------------------------+ | | | +| ``DBURIType`` | | | | ++-------------------------------------+ | | | +| ``XDBURIType`` | | | | ++-------------------------------------+ | | | +| ``HTTPURIType`` | | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... AS OBJECT (...)`` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ +| ``JSON`` | unsupported | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... AS VARRAY ...`` | | | | ++-------------------------------------+ | | | +| ``CREATE TYPE ... 
AS TABLE OF ...`` | | | | ++-------------------------------------+------------------+---------------------+----------------------+ + +Special types +~~~~~~~~~~~~~ + ++--------------------+-------------------+---------------------+----------------------+ +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | ++====================+===================+=====================+======================+ +| ``BOOLEAN`` | ``BooleanType()`` | ``BOOLEAN`` | ``NUMBER(P=1, S=0)`` | ++--------------------+-------------------+---------------------+----------------------+ +| ``ROWID`` | ``StringType()`` | ``CLOB`` | ``CLOB`` | ++--------------------+ | | | +| ``UROWID`` | | | | ++--------------------+ | | | +| ``UROWID(N)`` | | | | ++--------------------+-------------------+---------------------+----------------------+ +| ``ANYTYPE`` | unsupported | | | ++--------------------+ | | | +| ``ANYDATA`` | | | | ++--------------------+ | | | +| ``ANYDATASET`` | | | | ++--------------------+-------------------+---------------------+----------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column of unsupported type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS CLOB)`` to convert data to string representation on Oracle side, and so it will be read as Spark's ``StringType()``. + +It is also possible to use `JSON_ARRAY `_ +or `JSON_OBJECT `_ Oracle functions +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: + +.. code-block:: python + + from pyspark.sql.types import IntegerType + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + DBReader( + connection=oracle, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str", + # or + "JSON_ARRAY(array_column) array_column_json", + ], + ) + df = reader.run() + + # Spark requires all columns to have some type, describe it + column_type = IntegerType() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + # or explicit json parsing + from_json(df.array_column_json, schema).alias("array_column"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +It is always possible to convert data on Spark side to string, and then write it to ``text`` column in Oracle table. + +For example, you can convert data using `to_json `_ function. + +.. code:: python + + from pyspark.sql.functions import to_json + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + oracle.execute( + """ + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000) -- any string type, actually + ) + """, + ) + + write_df = df.select( + df.id, + df.supported_column, + to_json(df.unsupported_column).alias("array_column_json"), + ) + + writer = DBWriter( + connection=oracle, + target="schema.target_table", + ) + writer.run(write_df) + +Then you can parse this column on Oracle side - for example, by creating a view: + +.. code-block:: sql + + SELECT + id, + supported_column, + JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0 + FROM + schema.target_table + +Or by using `VIRTUAL column `_: + +.. 
code-block:: sql + + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000), -- any string type, actually + array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL + ) + +But data will be parsed on each table read in any case, as Oracle does no support ``GENERATED ALWAYS AS (...) STORED`` columns. diff --git a/docs/connection/db_connection/oracle/write.rst b/docs/connection/db_connection/oracle/write.rst index 78c57d915..0f21ec2ac 100644 --- a/docs/connection/db_connection/oracle/write.rst +++ b/docs/connection/db_connection/oracle/write.rst @@ -1,9 +1,47 @@ .. _oracle-write: -Writing to Oracle -================= +Writing to Oracle using ``DBWriter`` +==================================== -For writing data to Oracle, use :obj:`DBWriter ` with options below. +For writing data to Oracle, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`oracle-types` + +.. warning:: + + It is always recommended to create table explicitly using :obj:`Oracle.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Oracle + from onetl.db import DBWriter + + oracle = Oracle(...) + + df = ... # data is here + + writer = DBWriter( + connection=oracle, + target="schema.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index ea9bd5048..ac9f553a5 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -1,7 +1,7 @@ .. _postgres-execute: Executing statements in Postgres -================================== +================================ How to ------ @@ -9,7 +9,7 @@ How to There are 2 ways to execute some statement in Postgres Use :obj:`Postgres.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading Postgres config, or reading data from some reference table. @@ -23,12 +23,9 @@ Syntax support This method supports **any** query syntax supported by Postgres, like: -* ``SELECT ... FROM ...`` -* ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -It does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...``\ +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ @@ -60,15 +57,14 @@ Syntax support This method supports **any** query syntax supported by Postgres, like: -* ``CREATE TABLE ...``, ``CREATE VIEW ...`` -* ``ALTER ...`` -* ``INSERT INTO ... 
AS SELECT ...`` -* ``DROP TABLE ...``, ``DROP VIEW ...``, and so on -* ``CALL procedure(arg1, arg2) ...`` -* ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions -* etc - -It does not support multiple queries in the same operation, like ``SET ...; CREATE TABLE ...;``. +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` +* ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples ^^^^^^^^ @@ -84,7 +80,7 @@ Examples postgres.execute( """ CREATE TABLE schema.table AS ( - id biging ALWAYS GENERATED AS IDENTITY, + id bigint GENERATED ALWAYS AS IDENTITY, key text, value real ) diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst index 1921a3830..509b54bc0 100644 --- a/docs/connection/db_connection/postgres/prerequisites.rst +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -23,12 +23,6 @@ See :ref:`install-spark` installation instruction for more details. Connecting to Postgres ----------------------- -Connection port -~~~~~~~~~~~~~~~ - -Connection is usuallu performed to port 5432. Port may differ for different Postgres instances. -Please ask your Postgres administrator to provide required information. - Allowing connection to Postgres instance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -37,8 +31,16 @@ e.g. by updating ``pg_hba.conf`` file. See `official documentation `_. -Postgres cluster interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 5432. Port may differ for different Postgres instances. +Please ask your Postgres administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Postgres by using either DNS name of host or it's IP address. If you're using Postgres cluster, it is currently possible to connect only to **one specific node**. Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. @@ -57,7 +59,10 @@ used for creating a connection: GRANT USAGE, CREATE ON SCHEMA myschema TO username; -- allow read & write access to specific table - GRANT SELECT, INSERT, TRUNCATE ON myschema.mytable TO username; + GRANT SELECT, INSERT ON myschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + GRANT TRUNCATE ON myschema.mytable TO username; .. code-tab:: sql Read only diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index fc4a8e3b3..0733f8e08 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -1,7 +1,7 @@ .. _postgres-read: Reading from Postgres using ``DBReader`` -========================================== +======================================== .. warning:: diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index ae6d5602c..1430381a1 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -1,7 +1,7 @@ .. 
_postgres-sql: Reading from Postgres using ``Postgres.sql`` -================================================ +============================================ .. warning:: @@ -18,7 +18,7 @@ Syntax support Only queries with the following syntax are supported: * ``SELECT ...`` -* ``WITH ... SELECT ...`` +* ``WITH alias AS (...) SELECT ...`` Queries like ``SHOW ...`` are not supported. diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index b149b64bd..224d4ee4d 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -9,7 +9,7 @@ Type detection & casting Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. Reading from Postgres -~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~ This is how Postgres connector performs this: @@ -20,8 +20,8 @@ This is how Postgres connector performs this: .. [1] All Postgres types that doesn't have corresponding Java type are converted to ``String``. -Writing to some existing Clickhuse table -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Writing to some existing Postgres table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is how Postgres connector performs this: @@ -319,17 +319,18 @@ Geo types +----------------------+-----------------------+-----------------------+-------------------------+ Explicit type cast -------------------- +------------------ ``DBReader`` -~~~~~~~~~~~ +~~~~~~~~~~~~ It is possible to explicitly cast column of unsupported type using ``DBReader(columns=...)`` syntax. For example, you can use ``CAST(column AS text)`` to convert data to string representation on Postgres side, and so it will be read as Spark's ``StringType()``. -It is also possible to use ``to_json`` Postgres function for convert column of any type to string representation, -and then parse this column on Spark side using `from_json `_: +It is also possible to use `to_json `_ Postgres function +to convert column of any type to string representation, and then parse this column on Spark side using +`from_json `_: .. code-block:: python @@ -348,7 +349,7 @@ and then parse this column on Spark side using `from_json 'field' + array_column_json->'0' AS array_item_0 FROM schema.target_table diff --git a/docs/connection/db_connection/postgres/write.rst b/docs/connection/db_connection/postgres/write.rst index 1f5f2ed5c..ad0c42f68 100644 --- a/docs/connection/db_connection/postgres/write.rst +++ b/docs/connection/db_connection/postgres/write.rst @@ -1,7 +1,7 @@ .. _postgres-write: Writing to Postgres using ``DBWriter`` -======================================== +====================================== For writing data to Postgres, use :obj:`DBWriter `. diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 7008d09b5..f6b1ad2d9 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -73,29 +73,9 @@ class Oracle(JDBCConnection): Based on Maven package ``com.oracle.database.jdbc:ojdbc8:23.2.0.0`` (`official Oracle JDBC driver `_). - .. dropdown:: Version compatibility - - * Oracle Server versions: 23, 21, 19, 18, 12.2 and probably 11.2 (tested, but that's not official). - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. 
warning:: - To use Oracle connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`oracle-prerequisites` Parameters ---------- @@ -116,16 +96,16 @@ class Oracle(JDBCConnection): .. warning :: - Be careful, to correct work you must provide ``sid`` or ``service_name`` + You should provide either ``sid`` or ``service_name``, not both of them service_name : str, default: ``None`` Specifies one or more names by which clients can connect to the instance. - For example: ``MYDATA``. + For example: ``PDB1``. .. warning :: - Be careful, for correct work you must provide ``sid`` or ``service_name`` + You should provide either ``sid`` or ``service_name``, not both of them spark : :obj:`pyspark.sql.SparkSession` Spark session. @@ -133,24 +113,22 @@ class Oracle(JDBCConnection): extra : dict, default: ``None`` Specifies one or more extra parameters by which clients can connect to the instance. - For example: ``{"defaultBatchValue": 100}`` + For example: ``{"remarksReporting": "false"}`` - See `Oracle JDBC driver properties documentation - `_ - for more details + See official documentation: + * `Connection parameters `_ + * `Connection properties `_ Examples -------- - Oracle connection initialization + Connect to Oracle using ``sid``: .. code:: python from onetl.connection import Oracle from pyspark.sql import SparkSession - extra = {"defaultBatchValue": 100} - # Create Spark session with Oracle driver loaded maven_packages = Oracle.get_packages() spark = ( @@ -165,7 +143,22 @@ class Oracle(JDBCConnection): user="user", password="*****", sid="XE", - extra=extra, + extra={"remarksReporting": "false"}, + spark=spark, + ) + + Using ``service_name``: + + .. code:: python + + ... 
+ + oracle = Oracle( + host="database.host.or.ip", + user="user", + password="*****", + service_name="PDB1", + extra={"remarksReporting": "false"}, spark=spark, ) From ab2139facaa82ae7a2717acb83df1ffa4d430c84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 14 Mar 2024 14:54:13 +0000 Subject: [PATCH 45/63] [DOP-13681] Improve MongoDB documentation --- .../next_release/236.improvement.rst | 4 + .../db_connection/mongodb/index.rst | 8 + .../db_connection/mongodb/pipeline.rst | 25 ++ .../db_connection/mongodb/prerequisites.rst | 77 +++++ .../connection/db_connection/mongodb/read.rst | 130 ++++++++- .../db_connection/mongodb/types.rst | 266 ++++++++++++++++++ .../db_connection/mongodb/write.rst | 38 ++- .../db_connection/mongodb/connection.py | 47 +--- .../db_connection/mongodb/options.py | 2 +- 9 files changed, 547 insertions(+), 50 deletions(-) create mode 100644 docs/changelog/next_release/236.improvement.rst create mode 100644 docs/connection/db_connection/mongodb/pipeline.rst create mode 100644 docs/connection/db_connection/mongodb/prerequisites.rst create mode 100644 docs/connection/db_connection/mongodb/types.rst diff --git a/docs/changelog/next_release/236.improvement.rst b/docs/changelog/next_release/236.improvement.rst new file mode 100644 index 000000000..a56c624fc --- /dev/null +++ b/docs/changelog/next_release/236.improvement.rst @@ -0,0 +1,4 @@ +Improve MongoDB documentation: + * Add "Types" section describing mapping between MongoDB and Spark types + * Add "Prerequisites" section describing different aspects of connecting to MongoDB + * Separate documentation of ``DBReader`` and ``MongoDB.pipeline`` diff --git a/docs/connection/db_connection/mongodb/index.rst b/docs/connection/db_connection/mongodb/index.rst index a863265a4..6eacc8548 100644 --- a/docs/connection/db_connection/mongodb/index.rst +++ b/docs/connection/db_connection/mongodb/index.rst @@ -7,6 +7,7 @@ MongoDB :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,4 +15,11 @@ MongoDB :caption: Operations read + pipeline write + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mongodb/pipeline.rst b/docs/connection/db_connection/mongodb/pipeline.rst new file mode 100644 index 000000000..d3c876e4d --- /dev/null +++ b/docs/connection/db_connection/mongodb/pipeline.rst @@ -0,0 +1,25 @@ +.. _mongodb-sql: + +Reading from MongoDB using ``MongoDB.pipeline`` +=============================================== + +.. warning:: + + Please take into account :ref:`mongodb-types` + +:obj:`MongoDB.sql ` allows passing custom pipeline, +but does not support incremental strategies. + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.mongodb.connection + +.. automethod:: MongoDB.pipeline + +.. currentmodule:: onetl.connection.db_connection.mongodb.options + +.. autopydantic_model:: MongoDBPipelineOptions + :member-order: bysource + :model-show-field-summary: false + :field-show-constraints: false diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst new file mode 100644 index 000000000..6bc6e90e9 --- /dev/null +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -0,0 +1,77 @@ +.. 
_mongodb-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* MongoDB server versions: 4.0 or higher +* Spark versions: 3.2.x - 3.4.x +* Scala versions: 2.12 - 2.13 +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use MongoDB connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MongoDB +--------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MongoDB host by using either DNS name of host or it's IP address. + +It is also possible to connect to MongoDB shared cluster: + +.. code-block:: python + + + mongo = MongoDB( + host="master.host.or.ip", + user="user", + password="*****", + database="target_database", + spark=spark, + extra={ + # read data from secondary cluster node, switch to primary if not available + "readPreference": "secondaryPreferred", + }, + ) + +Supported ``readPreference`` values are described in `official documentation `_. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port ``27017``. Port may differ for different MongoDB instances. +Please ask your MongoDB administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MongoDB cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: js Read + Write + + // allow writing data to specific database + db.grantRolesToUser("username", [{db: "somedb", role: "readWrite"}]) + + .. code-tab:: js Read only + + // allow reading data from specific database + db.grantRolesToUser("username", [{db: "somedb", role: "read"}]) + +See: + * `db.grantRolesToUser documentation `_ + * `MongoDB builtin roles `_ diff --git a/docs/connection/db_connection/mongodb/read.rst b/docs/connection/db_connection/mongodb/read.rst index b3d51686a..8fad2120c 100644 --- a/docs/connection/db_connection/mongodb/read.rst +++ b/docs/connection/db_connection/mongodb/read.rst @@ -1,16 +1,127 @@ .. _mongodb-read: -Reading from MongoDB -==================== +Reading from MongoDB using ``DBReader`` +======================================= -There are 2 ways of distributed data reading from MongoDB: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` and :obj:`MongoDBReadOptions ` -* Using :obj:`MongoDB.pipeline ` with :obj:`MongoDBPipelineOptions ` + Please take into account :ref:`mongodb-types` -.. currentmodule:: onetl.connection.db_connection.mongodb.connection +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom pipelines, e.g. aggregation. -.. automethod:: MongoDB.pipeline +Supported DBReader features +--------------------------- + +* ❌ ``columns`` (for now, all document fields are read) +* ✅︎ ``where`` (passed to ``{"$match": ...}`` aggregation pipeline) +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* * Note that ``expression`` field of HWM can only be a field name, not a custom expression +* ✅︎ ``hint`` (see `official documentation `_) +* ✅︎ ``df_schema`` (mandatory) +* ✅︎ ``options`` (see :obj:`MongoDBReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. 
code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + options=MongoDBReadOptions(batchSize=10000), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"), + options=MongoDBReadOptions(batchSize=10000), + ) + + with IncrementalStrategy(): + df = reader.run() + +Read options +------------ .. currentmodule:: onetl.connection.db_connection.mongodb.options @@ -18,8 +129,3 @@ There are 2 ways of distributed data reading from MongoDB: :member-order: bysource :model-show-field-summary: false :field-show-constraints: false - -.. autopydantic_model:: MongoDBPipelineOptions - :member-order: bysource - :model-show-field-summary: false - :field-show-constraints: false diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst new file mode 100644 index 000000000..331a663d0 --- /dev/null +++ b/docs/connection/db_connection/mongodb/types.rst @@ -0,0 +1,266 @@ +.. _mongodb-types: + +MongoDB <-> Spark type mapping +============================== + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of fields with corresponding Spark types. All operations on a field are performed using field type. + +MongoDB is, by design, __schemaless__. So there are 2 ways how this can be handled: + +* User provides DataFrame schema explicitly: + + .. dropdown:: See example + + .. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] + ), + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) + +* Rely on MongoDB connector schema infer: + + .. 
code-block:: python + + df = mongodb.pipeline(collection="some_collection") + + In this case MongoDB connector read a sample of collection documents, and build DataFrame schema based on document fields and values. + +It is highly recommended to pass ``df_schema`` explicitly, to avoid type conversion issues. + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++---------------------+-----------------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=============================+======================+ +| ``Decimal128`` | ``DecimalType(P=34, S=32)`` | ``Decimal128`` | ++---------------------+-----------------------------+----------------------+ +| ``-`` | ``FloatType()`` | ``Double`` | ++---------------------+-----------------------------+ | +| ``Double`` | ``DoubleType()`` | | ++---------------------+-----------------------------+----------------------+ +| ``-`` | ``ByteType()`` | ``Int32`` | ++---------------------+-----------------------------+ | +| ``-`` | ``ShortType()`` | | ++---------------------+-----------------------------+ | +| ``Int32`` | ``IntegerType()`` | | ++---------------------+-----------------------------+----------------------+ +| ``Int64`` | ``LongType()`` | ``Int64`` | ++---------------------+-----------------------------+----------------------+ + +Temporal types +~~~~~~~~~~~~~~ + ++------------------------+-----------------------------------+-------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++========================+===================================+=========================+ +| ``-`` | ``DateType()``, days | ``Date``, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| ``Date``, milliseconds | ``TimestampType()``, microseconds | ``Date``, milliseconds, | +| | | **precision loss** [2]_ | ++------------------------+-----------------------------------+-------------------------+ +| ``Timestamp``, seconds | ``TimestampType()``, microseconds | ``Date``, milliseconds | ++------------------------+-----------------------------------+-------------------------+ +| ``-`` | ``DayTimeIntervalType()`` | unsupported | ++------------------------+-----------------------------------+-------------------------+ + +.. 
warning:: + + Note that types in MongoDB and Spark have different value ranges: + + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MongoDB type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | ``Date`` | -290 million years | 290 million years | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``Timestamp`` | ``1970-01-01 00:00:00`` | ``2106-02-07 09:28:16`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all values can be read from MongoDB to Spark, and can written from Spark DataFrame to MongoDB. + + References: + * `MongoDB Date type documentation `_ + * `MongoDB Timestamp documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [2] + MongoDB ``Date`` type has precision up to milliseconds (``23:59:59.999``). + Inserting data with microsecond precision (``23:59:59.999999``) + will lead to **throwing away microseconds**. + +String types +~~~~~~~~~~~~~ + +Note: fields of deprecated MongoDB type ``Symbol`` are excluded during read. + ++---------------------+------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+==================+======================+ +| ``String`` | ``StringType()`` | ``String`` | ++---------------------+ | | +| ``Code`` | | | ++---------------------+ | | +| ``RegExp`` | | | ++---------------------+------------------+----------------------+ + +Binary types +~~~~~~~~~~~~ + ++---------------------+-------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+===================+======================+ +| ``Boolean`` | ``BooleanType()`` | ``Boolean`` | ++---------------------+-------------------+----------------------+ +| ``Binary`` | ``BinaryType()`` | ``Binary`` | ++---------------------+-------------------+----------------------+ + +Struct types +~~~~~~~~~~~~ + ++---------------------+-----------------------+----------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=======================+======================+ +| ``Array[T]`` | ``ArrayType(T)`` | ``Array[T]`` | ++---------------------+-----------------------+----------------------+ +| ``Object[...]`` | ``StructType([...])`` | ``Object[...]`` | ++---------------------+-----------------------+ | +| ``-`` | ``MapType(...)`` | | ++---------------------+-----------------------+----------------------+ + +Special types +~~~~~~~~~~~~~ + ++---------------------+---------------------------------------------------------+---------------------------------------+ +| MongoDB type (read) | Spark type | MongoDB type (write) | ++=====================+=========================================================+=======================================+ +| ``ObjectId`` | ``StringType()`` | ``String`` | ++---------------------+ | | +| ``MaxKey`` | | | ++---------------------+ | | +| ``MinKey`` | | | 
++---------------------+---------------------------------------------------------+---------------------------------------+ +| ``Null`` | ``NullType()`` | ``Null`` | ++---------------------+ | | +| ``Undefined`` | | | ++---------------------+---------------------------------------------------------+---------------------------------------+ +| ``DBRef`` | ``StructType([$ref: StringType(), $id: StringType()])`` | ``Object[$ref: String, $id: String]`` | ++---------------------+---------------------------------------------------------+---------------------------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +Currently it is not possible to cast field types using ``DBReader``. But this can be done using ``MongoDB.pipeline``. + +``MongoDB.pipeline`` +~~~~~~~~~~~~~~~~~~~~ + +You can use ``$project`` aggregation to cast field types: + +.. code-block:: python + + from pyspark.sql.types import IntegerType, StructField, StructType + + from onetl.connection import MongoDB + from onetl.db import DBReader + + mongodb = MongoDB(...) + + df = mongodb.pipeline( + collection="my_collection", + pipeline=[ + { + "$project": { + # convert unsupported_field to string + "unsupported_field_str": { + "$convert": { + "input": "$unsupported_field", + "to": "string", + }, + }, + # skip unsupported_field from result + "unsupported_field": 0, + } + } + ], + ) + + # cast field content to proper Spark type + df = df.select( + df.id, + df.supported_field, + # explicit cast + df.unsupported_field_str.cast("integer").alias("parsed_integer"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert dataframe field to string on Spark side, and then write it to MongoDB: + +.. code:: python + + + df = df.select( + df.id, + df.unsupported_field.cast("string").alias("array_field_json"), + ) + + writer.run(df) diff --git a/docs/connection/db_connection/mongodb/write.rst b/docs/connection/db_connection/mongodb/write.rst index 5fff32d70..3ae86fece 100644 --- a/docs/connection/db_connection/mongodb/write.rst +++ b/docs/connection/db_connection/mongodb/write.rst @@ -1,9 +1,41 @@ .. _mongodb-write: -Writing to MongoDB -================== +Writing to MongoDB using ``DBWriter`` +===================================== -For writing data to MongoDB, use :obj:`DBWriter ` with options below. +For writing data to MongoDB, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`mongodb-types` + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBWriter + + mongodb = MongoDB(...) + + df = ... # data is here + + writer = DBWriter( + connection=mongodb, + target="schema.table", + options=MongoDB.WriteOptions( + if_exists="append", + ), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.mongodb.options diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index b34b6810a..1c72acacb 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -50,33 +50,12 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. |support_hooks| - Based on package ``org.mongodb.spark:mongo-spark-connector`` - (`MongoDB connector for Spark `_) - - .. 
dropdown:: Version compatibility - - * MongoDB server versions: 4.0 or higher - * Spark versions: 3.2.x - 3.4.x - * Scala versions: 2.12 - 2.13 - * Java versions: 8 - 20 - - See `official documentation `_. + Based on package ``org.mongodb.spark:mongo-spark-connector:10.1.1`` + (`MongoDB connector for Spark `_) .. warning:: - To use MongoDB connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.4.2 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mongodb-prerequisites` Parameters ---------- @@ -118,7 +97,7 @@ class MongoDB(DBConnection): from pyspark.sql import SparkSession # Create Spark session with MongoDB connector loaded - maven_packages = MongoDB.get_packages(spark_version="3.2") + maven_packages = MongoDB.get_packages(spark_version="3.4") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -132,7 +111,7 @@ class MongoDB(DBConnection): password="*****", database="target_database", spark=spark, - ) + ).check() """ database: str @@ -181,8 +160,8 @@ def get_packages( from onetl.connection import MongoDB - MongoDB.get_packages(scala_version="2.11") - MongoDB.get_packages(spark_version="3.2") + MongoDB.get_packages(scala_version="2.12") + MongoDB.get_packages(spark_version="3.4") """ @@ -249,7 +228,7 @@ def pipeline( .. code:: js - db.collection.aggregate([{"$match": ...}, {"$group": ...}]) + db.collection_name.aggregate([{"$match": ...}, {"$group": ...}]) but pipeline is executed on Spark executors, in a distributed way. @@ -285,13 +264,13 @@ def pipeline( Examples -------- - Get document with a specific ``_id``: + Get document with a specific ``field`` value: .. code:: python df = connection.pipeline( collection="collection_name", - pipeline={"$match": {"_id": {"$eq": 1}}}, + pipeline={"$match": {"field": {"$eq": 1}}}, ) Calculate aggregation and get result: @@ -324,7 +303,7 @@ def pipeline( df_schema = StructType( [ - StructField("_id", IntegerType()), + StructField("_id", StringType()), StructField("some_string", StringType()), StructField("some_int", IntegerType()), StructField("some_datetime", TimestampType()), @@ -344,8 +323,8 @@ def pipeline( df = connection.pipeline( collection="collection_name", - pipeline={"$match": {"_id": {"$eq": 1}}}, - options=MongoDB.PipelineOptions(hint={"_id": 1}), + pipeline={"$match": {"field": {"$eq": 1}}}, + options=MongoDB.PipelineOptions(hint={"field": 1}), ) """ diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 84022b8a5..2ddc7a068 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -117,7 +117,7 @@ class MongoDBPipelineOptions(GenericOptions): .. 
code:: python MongoDB.PipelineOptions( - hint="{'_id': 1}", + hint={"some_field": 1}, ) """ From 9c8ccf2abcfcfce0d1062e37f48419d8995d1b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 14 Mar 2024 14:59:25 +0000 Subject: [PATCH 46/63] [DOP-13681] Allow empty pipeline in MongoDB.pipeline --- docs/changelog/next_release/237.feature.rst | 1 + .../db_connection/mongodb/connection.py | 23 +++--- .../test_mongodb_integration.py | 80 +++++++++++++++---- 3 files changed, 75 insertions(+), 29 deletions(-) create mode 100644 docs/changelog/next_release/237.feature.rst diff --git a/docs/changelog/next_release/237.feature.rst b/docs/changelog/next_release/237.feature.rst new file mode 100644 index 000000000..b053e437c --- /dev/null +++ b/docs/changelog/next_release/237.feature.rst @@ -0,0 +1 @@ +Allow calling ``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 1c72acacb..acaaa6539 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -216,7 +216,7 @@ def package_spark_3_4(cls) -> str: def pipeline( self, collection: str, - pipeline: dict | list[dict], + pipeline: dict | list[dict] | None = None, df_schema: StructType | None = None, options: MongoDBPipelineOptions | dict | None = None, ): @@ -237,28 +237,20 @@ def pipeline( This method does not support :ref:`strategy`, use :obj:`DBReader ` instead - .. note:: - - Statement is executed in read-write connection, - so if you're calling some stored functions, you can make changes - to the data source. - - Unfortunately, Spark does no provide any option to change this behavior. - Parameters ---------- collection : str Collection name. - pipeline : dict | list[dict] + pipeline : dict | list[dict], optional Pipeline containing a database query. See `Aggregation pipeline syntax `_. - df_schema : StructType, default: ``None`` + df_schema : StructType, optional Schema describing the resulting DataFrame. - options : PipelineOptions | dict, default: ``None`` + options : PipelineOptions | dict, optional Additional pipeline options, see :obj:`~PipelineOptions`. 
Examples @@ -331,7 +323,9 @@ def pipeline( log.info("|%s| Executing aggregation pipeline:", self.__class__.__name__) read_options = self.PipelineOptions.parse(options).dict(by_alias=True, exclude_none=True) - pipeline = self.dialect.prepare_pipeline(pipeline) + if pipeline: + pipeline = self.dialect.prepare_pipeline(pipeline) + log_with_indent(log, "collection = %r", collection) log_json(log, pipeline, name="pipeline") @@ -342,7 +336,8 @@ def pipeline( log_options(log, read_options) read_options["collection"] = collection - read_options["aggregation.pipeline"] = json.dumps(pipeline) + if pipeline: + read_options["aggregation.pipeline"] = json.dumps(pipeline) read_options["connection.uri"] = self.connection_url spark_reader = self.spark.read.format("mongodb").options(**read_options) diff --git a/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py b/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py index 6283df1a1..86daa7d91 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_mongodb_integration.py @@ -48,10 +48,9 @@ def test_mongodb_connection_check_fail(processing, spark): mongo.check() -def test_mongodb_connection_read_pipeline_match( +def test_mongodb_connection_read_pipeline( spark, prepare_schema_table, - load_table_data, processing, ): mongo = MongoDB( @@ -63,23 +62,60 @@ def test_mongodb_connection_read_pipeline_match( spark=spark, ) + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, - pipeline={"$match": {"_id": {"$eq": 1}}}, ) - assert df - assert df.count() == 1 + processing.assert_equal_df( + df=df, + other_frame=write_df, + ) - collected = df.collect() - assert collected[0][0] == 1 # _id +def test_mongodb_connection_read_pipeline_match( + spark, + prepare_schema_table, + processing, +): + mongo = MongoDB( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + + df = mongo.pipeline( + collection=prepare_schema_table.table, + pipeline={"$match": {"_id": {"$eq": 1}}}, + ) + + processing.assert_equal_df( + df=df, + other_frame=write_df[write_df._id == 1], + ) def test_mongodb_connection_read_pipeline_match_with_df_schema( spark, prepare_schema_table, - load_table_data, processing, ): mongo = MongoDB( @@ -91,21 +127,27 @@ def test_mongodb_connection_read_pipeline_match_with_df_schema( spark=spark, ) + write_df = processing.create_pandas_df() + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, pipeline={"$match": {"_id": {"$eq": 1}}}, df_schema=prepare_schema_table.schema, ) - assert df - assert df.count() == 1 - - collected = df.collect() - - assert collected[0]["_id"] == 1 + processing.assert_equal_df( + df=df, + other_frame=write_df[write_df._id == 1], + ) -def test_mongodb_connection_read_pipeline_group(spark, prepare_schema_table, load_table_data, processing): +def test_mongodb_connection_read_pipeline_group(spark, 
prepare_schema_table, processing): mongo = MongoDB( host=processing.host, port=processing.port, @@ -115,6 +157,14 @@ def test_mongodb_connection_read_pipeline_group(spark, prepare_schema_table, loa spark=spark, ) + write_df = processing.create_pandas_df(min_id=1, max_id=100) + + processing.insert_data( + schema=prepare_schema_table.schema, + table=prepare_schema_table.table, + values=write_df, + ) + df = mongo.pipeline( collection=prepare_schema_table.table, pipeline={"$group": {"_id": 1, "min": {"$min": "$hwm_int"}, "max": {"$max": "$hwm_int"}}}, From dabcc48890e370d21aa34a8cf82486d162de732b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 15 Mar 2024 13:34:52 +0000 Subject: [PATCH 47/63] [DOP-13255] Improve MongoDB documentation --- docs/connection/db_connection/mongodb/types.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index 331a663d0..bd5978aa9 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -115,7 +115,9 @@ Temporal types +------------------------+-----------------------------------+-------------------------+ | ``Timestamp``, seconds | ``TimestampType()``, microseconds | ``Date``, milliseconds | +------------------------+-----------------------------------+-------------------------+ -| ``-`` | ``DayTimeIntervalType()`` | unsupported | +| ``-`` | ``TimestampNTZType()`` | unsupported | ++------------------------+-----------------------------------+ | +| ``-`` | ``DayTimeIntervalType()`` | | +------------------------+-----------------------------------+-------------------------+ .. warning:: From 5813112e7e5cd23474727c13afedd586f5ee2e9f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 06:44:22 +0000 Subject: [PATCH 48/63] Bump the github-actions group with 1 update Bumps the github-actions group with 1 update: [tj-actions/changed-files](https://github.com/tj-actions/changed-files). Updates `tj-actions/changed-files` from 42 to 43 - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v42...v43) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 46 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index de57b161f..e4579eb28 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -86,7 +86,7 @@ jobs: - name: Check if base files are changed id: changed-base - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/base/tracked.txt files_ignore_from_source_file: .github/workflows/data/base/ignored.txt @@ -97,7 +97,7 @@ jobs: - name: Check if db-related files are changed id: changed-db - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/db/tracked.txt files_ignore_from_source_file: .github/workflows/data/db/ignored.txt @@ -108,7 +108,7 @@ jobs: - name: Check if file-related files are changed id: changed-file - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/file/tracked.txt files_ignore_from_source_file: .github/workflows/data/file/ignored.txt @@ -119,7 +119,7 @@ jobs: - name: Check if file-df-related files are changed id: changed-file-df - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/file-df/tracked.txt files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt @@ -130,7 +130,7 @@ jobs: - name: Check if core files are changed id: changed-core - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/core/tracked.txt files_ignore_from_source_file: .github/workflows/data/core/ignored.txt @@ -160,7 +160,7 @@ jobs: - name: Check if Clickhouse files are changed id: changed-clickhouse - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/clickhouse/tracked.txt files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt @@ -190,7 +190,7 @@ jobs: - name: Check if Greenplum files are changed id: changed-greenplum - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/greenplum/tracked.txt files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt @@ -220,7 +220,7 @@ jobs: - name: Check if Hive files are changed id: changed-hive - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/hive/tracked.txt files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt @@ -250,7 +250,7 @@ jobs: - name: Check if Kafka files are changed id: changed-kafka - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/kafka/tracked.txt files_ignore_from_source_file: .github/workflows/data/kafka/ignored.txt @@ -280,7 +280,7 @@ jobs: - name: Check if LocalFS files are changed id: changed-local-fs - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/local-fs/tracked.txt files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt @@ -310,7 +310,7 @@ jobs: - name: Check if MongoDB files are changed id: changed-mongodb - uses: tj-actions/changed-files@v42 + uses: 
tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mongodb/tracked.txt files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt @@ -340,7 +340,7 @@ jobs: - name: Check if MSSQL files are changed id: changed-mssql - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mssql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt @@ -370,7 +370,7 @@ jobs: - name: Check if MySQL files are changed id: changed-mysql - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/mysql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt @@ -400,7 +400,7 @@ jobs: - name: Check if Oracle files are changed id: changed-oracle - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/oracle/tracked.txt files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt @@ -430,7 +430,7 @@ jobs: - name: Check if Postgres files are changed id: changed-postgres - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/postgres/tracked.txt files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt @@ -460,7 +460,7 @@ jobs: - name: Check if Teradata files are changed id: changed-teradata - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/teradata/tracked.txt files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt @@ -490,7 +490,7 @@ jobs: - name: Check if FTP files are changed id: changed-ftp - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/ftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt @@ -520,7 +520,7 @@ jobs: - name: Check if FTPS files are changed id: changed-ftps - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/ftps/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt @@ -550,7 +550,7 @@ jobs: - name: Check if HDFS files are changed id: changed-hdfs - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/hdfs/tracked.txt files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt @@ -580,7 +580,7 @@ jobs: - name: Check if S3 files are changed id: changed-s3 - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/s3/tracked.txt files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt @@ -610,7 +610,7 @@ jobs: - name: Check if SFTP files are changed id: changed-sftp - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/sftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt @@ -640,7 +640,7 @@ jobs: - name: Check if Samba files are changed id: changed-samba - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/samba/tracked.txt files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt @@ -670,7 +670,7 @@ jobs: - name: Check if WebDAV files are 
changed id: changed-webdav - uses: tj-actions/changed-files@v42 + uses: tj-actions/changed-files@v43 with: files_from_source_file: .github/workflows/data/webdav/tracked.txt files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt From 2655a718c4d68e4ab3e1a54a5e61f7f7b368e8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 13 Mar 2024 13:14:08 +0000 Subject: [PATCH 49/63] [DOP-13252] Improve MSSQL documentation --- .../next_release/235.improvement.rst | 5 + .../db_connection/clickhouse/sql.rst | 4 +- .../db_connection/clickhouse/types.rst | 68 +++- .../db_connection/greenplum/types.rst | 25 +- .../db_connection/mssql/execute.rst | 90 +++++ docs/connection/db_connection/mssql/index.rst | 10 +- .../db_connection/mssql/prerequisites.rst | 77 ++++ docs/connection/db_connection/mssql/read.rst | 72 +++- docs/connection/db_connection/mssql/sql.rst | 52 +++ docs/connection/db_connection/mssql/types.rst | 371 ++++++++++++++++++ docs/connection/db_connection/mssql/write.rst | 44 ++- .../db_connection/mysql/prerequisites.rst | 2 +- docs/connection/db_connection/mysql/sql.rst | 32 +- docs/connection/db_connection/mysql/types.rst | 144 +++---- .../db_connection/oracle/prerequisites.rst | 6 +- docs/connection/db_connection/oracle/sql.rst | 10 +- .../connection/db_connection/oracle/types.rst | 25 +- .../connection/db_connection/postgres/sql.rst | 9 +- .../db_connection/postgres/types.rst | 24 +- .../db_connection/mssql/connection.py | 23 +- 20 files changed, 935 insertions(+), 158 deletions(-) create mode 100644 docs/changelog/next_release/235.improvement.rst create mode 100644 docs/connection/db_connection/mssql/prerequisites.rst create mode 100644 docs/connection/db_connection/mssql/sql.rst create mode 100644 docs/connection/db_connection/mssql/types.rst diff --git a/docs/changelog/next_release/235.improvement.rst b/docs/changelog/next_release/235.improvement.rst new file mode 100644 index 000000000..6875f015f --- /dev/null +++ b/docs/changelog/next_release/235.improvement.rst @@ -0,0 +1,5 @@ +Improve MSSQL documentation: + * Add "Types" section describing mapping between MSSQL and Spark types + * Add "Prerequisites" section describing different aspects of connecting to MSSQL + * Separate documentation of ``DBReader`` and ``MSSQL.sql`` + * Add examples for ``MSSQL.fetch`` and ``MSSQL.execute`` diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst index cd331ef33..658bd052a 100644 --- a/docs/connection/db_connection/clickhouse/sql.rst +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -17,8 +17,8 @@ Syntax support Only queries with the following syntax are supported: -* ``SELECT ...`` -* ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` Queries like ``SHOW ...`` are not supported. diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index e9bdd22e9..1df369dd4 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -55,7 +55,7 @@ But Spark does not have specific dialect for Clickhouse, so Generic JDBC dialect Generic dialect is using SQL ANSI type names while creating tables in target database, not database-specific types. 
If some cases this may lead to using wrong column type. For example, Spark creates column of type ``TIMESTAMP`` -which corresponds to Clickhouse's type ``DateTime32`` (precision up to seconds) +which corresponds to Clickhouse type ``DateTime32`` (precision up to seconds) instead of more precise ``DateTime64`` (precision up to nanoseconds). This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. @@ -192,7 +192,10 @@ Numeric types Temporal types ~~~~~~~~~~~~~~ -Note: ``DateTime(P, TZ)`` has the same precision as ``DateTime(P)``. +Notes: + * Datetime with timezone has the same precision as without timezone + * ``DateTime`` is alias for ``DateTime32`` + * ``TIMESTAMP`` is alias for ``DateTime32``, but ``TIMESTAMP(N)`` is alias for ``DateTime64(N)`` +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ | Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | @@ -238,6 +241,31 @@ Note: ``DateTime(P, TZ)`` has the same precision as ``DateTime(P)``. | ``IntervalYear`` | | | | +-----------------------------------+--------------------------------------+----------------------------------+-------------------------------+ +.. warning:: + + Note that types in Clickhouse and Spark have different value ranges: + + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Clickhouse type | Min value | Max value | Spark type | Min value | Max value | + +========================+===================================+===================================+=====================+================================+================================+ + | ``Date`` | ``1970-01-01`` | ``2149-06-06`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``DateTime32`` | ``1970-01-01 00:00:00`` | ``2106-02-07 06:28:15`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | ``DateTime64(P=0..8)`` | ``1900-01-01 00:00:00.00000000`` | ``2299-12-31 23:59:59.99999999`` | | | | + +------------------------+-----------------------------------+-----------------------------------+ | | | + | ``DateTime64(P=9)`` | ``1900-01-01 00:00:00.000000000`` | ``2262-04-11 23:47:16.999999999`` | | | | + +------------------------+-----------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values in Spark DataFrame can be written to Clickhouse. + + References: + * `Clickhouse Date documentation `_ + * `Clickhouse Datetime32 documentation `_ + * `Clickhouse Datetime64 documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + .. [4] Clickhouse support datetime up to nanoseconds precision (``23:59:59.999999999``), but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). 
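Because the ranges above differ, it may be worth validating DataFrame timestamps before writing to Clickhouse. A minimal sketch, assuming a hypothetical ``updated_at`` column written into a ``DateTime64(6)`` target column (``df`` is the DataFrame about to be written):

.. code-block:: python

    from pyspark.sql import functions as F

    # DateTime64(P=0..8) only accepts values between 1900-01-01 and 2299-12-31 (see table above),
    # so fail fast if any row falls outside that range.
    out_of_range = df.where(
        ~F.col("updated_at").between("1900-01-01 00:00:00", "2299-12-31 23:59:59")
    )
    if out_of_range.take(1):
        raise ValueError("DataFrame contains timestamps outside the Clickhouse DateTime64 range")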
@@ -257,17 +285,17 @@ String types +--------------------------------------+------------------+------------------------+--------------------------+ | Clickhouse type (read) | Spark type | Clickhousetype (write) | Clickhouse type (create) | +======================================+==================+========================+==========================+ -| ``IPv4`` | ``StringType()`` | ``String`` | ``String`` | +| ``FixedString(N)`` | ``StringType()`` | ``String`` | ``String`` | +--------------------------------------+ | | | -| ``IPv6`` | | | | +| ``String`` | | | | +--------------------------------------+ | | | | ``Enum8`` | | | | +--------------------------------------+ | | | | ``Enum16`` | | | | +--------------------------------------+ | | | -| ``FixedString(N)`` | | | | +| ``IPv4`` | | | | +--------------------------------------+ | | | -| ``String`` | | | | +| ``IPv6`` | | | | +--------------------------------------+------------------+ | | | ``-`` | ``BinaryType()`` | | | +--------------------------------------+------------------+------------------------+--------------------------+ @@ -352,7 +380,7 @@ and write it as ``String`` column in Clickhouse: array_column_json String, ) ENGINE = MergeTree() - ORDER BY time + ORDER BY id """, ) @@ -369,18 +397,34 @@ Then you can parse this column on Clickhouse side - for example, by creating a v .. code:: sql - SELECT id, JSONExtract(json_column, 'Array(String)') FROM target_tbl + SELECT + id, + JSONExtract(json_column, 'Array(String)') AS array_column + FROM target_tbl + +You can also use `ALIAS `_ +or `MATERIALIZED `_ columns +to avoid writing such expression in every ``SELECT`` clause all the time: -You can also use `ALIAS `_ columns -to avoid writing such expression in every ``SELECT`` clause all the time. +.. code-block:: sql + + CREATE TABLE default.target_tbl AS ( + id Int32, + array_column_json String, + -- computed column + array_column Array(String) ALIAS JSONExtract(json_column, 'Array(String)') + -- or materialized column + -- array_column Array(String) MATERIALIZED JSONExtract(json_column, 'Array(String)') + ) + ENGINE = MergeTree() + ORDER BY id Downsides: * Using ``SELECT JSONExtract(...)`` or ``ALIAS`` column can be expensive, because value is calculated on every row access. This can be especially harmful if such column is used in ``WHERE`` clause. -* Both ``ALIAS`` columns are not included in ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. +* ``ALIAS`` and ``MATERIALIZED`` columns are not included in ``SELECT *`` clause, they should be added explicitly: ``SELECT *, calculated_column FROM table``. .. warning:: - `MATERIALIZED `_ and `EPHEMERAL `_ columns are not supported by Spark because they cannot be selected to determine target column type. diff --git a/docs/connection/db_connection/greenplum/types.rst b/docs/connection/db_connection/greenplum/types.rst index fb3077e9f..baea34914 100644 --- a/docs/connection/db_connection/greenplum/types.rst +++ b/docs/connection/db_connection/greenplum/types.rst @@ -102,7 +102,9 @@ See Greenplum `CREATE TABLE `_. +See: + * `official connector documentation `_ + * `list of Greenplum types `_ Numeric types ~~~~~~~~~~~~~ @@ -181,6 +183,27 @@ Temporal types | ``tstzrange`` | | | | +------------------------------------+-------------------------+-----------------------+-------------------------+ +.. 
warning:: + + Note that types in Greenplum and Spark have different value ranges: + + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Greenplum type | Min value | Max value | Spark type | Min value | Max value | + +================+=================================+==================================+=====================+================================+================================+ + | ``date`` | ``-4713-01-01`` | ``5874897-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4713-01-01 00:00:00.000000`` | ``294276-12-31 23:59:59.999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +----------------+---------------------------------+----------------------------------+ | | | + | ``time`` | ``00:00:00.000000`` | ``24:00:00.000000`` | | | | + +----------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Greenplum to Spark. + + References: + * `Greenplum types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + .. [3] ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from Postgres like ``23:59:59`` diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index bed53fec5..54156ed76 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -3,6 +3,96 @@ Executing statements in MSSQL ============================= +How to +------ + +There are 2 ways to execute some statement in MSSQL + +Use :obj:`MSSQL.fetch ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +MSSQL config, or reading data from some reference table. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`MSSQL.close `. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MSSQL, like: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SELECT func(arg1, arg2) FROM DUAL`` - call function +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) + + df = mssql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MSSQL.JDBCOptions(query_timeout=10), + ) + mssql.close() + value = df.collect()[0][0] # get value from first row and first column + +Use :obj:`MSSQL.execute ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with :obj:`MSSQL.close `. 
+ +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by MSSQL, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...`` +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``DECLARE ... BEGIN ... END`` - execute PL/SQL statement +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) + + with mssql: + mssql.execute("DROP TABLE schema.table") + mssql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=MSSQL.JDBCOptions(query_timeout=10), + ) + + +References +---------- + .. currentmodule:: onetl.connection.db_connection.mssql.connection .. automethod:: MSSQL.fetch diff --git a/docs/connection/db_connection/mssql/index.rst b/docs/connection/db_connection/mssql/index.rst index 5e511e83e..696dfda53 100644 --- a/docs/connection/db_connection/mssql/index.rst +++ b/docs/connection/db_connection/mssql/index.rst @@ -1,12 +1,13 @@ .. _mssql: MSSQL -===== +====== .. toctree:: :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,12 @@ MSSQL :caption: Operations read + sql write execute + +.. toctree:: + :maxdepth: 1 + :caption: Troubleshooting + + types diff --git a/docs/connection/db_connection/mssql/prerequisites.rst b/docs/connection/db_connection/mssql/prerequisites.rst new file mode 100644 index 000000000..33786d61c --- /dev/null +++ b/docs/connection/db_connection/mssql/prerequisites.rst @@ -0,0 +1,77 @@ +.. _mssql-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* SQL Server versions: 2014 - 2022 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_ +and `official compatibility matrix `_. + +Installing PySpark +------------------ + +To use MSSQL connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to MSSQL +-------------------- + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port 1443. Port may differ for different MSSQL instances. +Please ask your MSSQL administrator to provide required information. + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to MSSQL by using either DNS name of host or it's IP address. + +If you're using MSSQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your MSSQL cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write (schema is owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema + GRANT ALTER ON username.mytable TO username; + + .. 
code-tab:: sql Read + Write (schema is not owned by user) + + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow managing tables in specific schema, and inserting data to tables + GRANT ALTER, SELECT, INSERT ON SCHEMA::someschema TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; + +More details can be found in official documentation: + * `GRANT ON DATABASE `_ + * `GRANT ON OBJECT `_ + * `GRANT ON SCHEMA `_ diff --git a/docs/connection/db_connection/mssql/read.rst b/docs/connection/db_connection/mssql/read.rst index 3a336f823..50c958c68 100644 --- a/docs/connection/db_connection/mssql/read.rst +++ b/docs/connection/db_connection/mssql/read.rst @@ -1,18 +1,74 @@ .. _mssql-read: -Reading from MSSQL -================== +Reading from MSSQL using ``DBReader`` +====================================== -There are 2 ways of distributed data reading from MSSQL: +.. warning:: -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`MSSQL.sql ` + Please take into account :ref:`mssql-types` -Both methods accept :obj:`JDBCReadOptions ` +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like JOINs. -.. currentmodule:: onetl.connection.db_connection.mssql.connection +Supported DBReader features +--------------------------- -.. automethod:: MSSQL.sql +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (MSSQL does support hints, but DBReader not, at least for now) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) + +Examples +-------- + +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + + mssql = MSSQL(...) + + reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + mssql = MSSQL(...) + + reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), + options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + +Read options +------------ .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst new file mode 100644 index 000000000..cf3467de4 --- /dev/null +++ b/docs/connection/db_connection/mssql/sql.rst @@ -0,0 +1,52 @@ +.. _mssql-sql: + +Reading from MSSQL using ``MSSQL.sql`` +======================================== + +.. warning:: + + Please take into account :ref:`mssql-types` + +:obj:`MSSQL.sql ` allows passing custom SQL query, +but does not support incremental strategies. + +Method also accepts :obj:`JDBCReadOptions `. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... 
FROM ...`` +* ❌ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MSSQL + + mssql = MSSQL(...) + df = mssql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + ) + +References +---------- + +.. currentmodule:: onetl.connection.db_connection.mssql.connection + +.. automethod:: MSSQL.sql diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst new file mode 100644 index 000000000..ea7fd7102 --- /dev/null +++ b/docs/connection/db_connection/mssql/types.rst @@ -0,0 +1,371 @@ +.. _mssql-types: + +MSSQL <-> Spark type mapping +================================= + +Type detection & casting +------------------------ + +Spark's DataFrames always have a ``schema`` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +Reading from MSSQL +~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MSSQL connector performs this: + +* For each column in query result (``SELECT column1, column2, ... FROM table ...``) get column name and MSSQL type. +* Find corresponding ``MSSQL type (read)`` -> ``Spark type`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Create DataFrame from query with specific column names and Spark types. + +Writing to some existing MSSQL table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is how MSSQL connector performs this: + +* Get names of columns in DataFrame. [1]_ +* Perform ``SELECT * FROM table LIMIT 0`` query. +* Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. +* Find corresponding ``Spark type`` -> ``MSSQL type (write)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* If ``MSSQL type (write)`` match ``MSSQL type (read)``, no additional casts will be performed, DataFrame column will be written to MSSQL as is. +* If ``MSSQL type (write)`` does not match ``MSSQL type (read)``, DataFrame column will be casted to target column type **on MSSQL side**. + For example, you can write column with text data to ``int`` column, if column contains valid integer values within supported value range and precision [2]_. + +.. [1] + This allows to write data to tables with ``DEFAULT`` and ``GENERATED`` columns - if DataFrame has no such column, + it will be populated by MSSQL. + +.. [2] + This is true only if DataFrame column is a ``StringType()``, because text value is parsed automatically to tagret column type. + + But other types cannot be silently converted, like ``int -> text``. This requires explicit casting, see `DBWriter`_. + +Create new table using Spark +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. warning:: + + ABSOLUTELY NOT RECOMMENDED! + +This is how MSSQL connector performs this: + +* Find corresponding ``Spark type`` -> ``MSSQL type (create)`` combination (see below) for each DataFrame column. If no combination is found, raise exception. +* Generate DDL for creating table in MSSQL, like ``CREATE TABLE (col1 ...)``, and run it. +* Write DataFrame to created table as is. + +But some cases this may lead to using wrong column type. 
For example, Spark creates column of type ``timestamp`` +which corresponds to MSSQL's type ``timestamp(0)`` (precision up to seconds) +instead of more precise ``timestamp(6)`` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +.. dropdown:: See example + + .. code:: python + + writer = DBWriter( + connection=mssql, + table="myschema.target_tbl", + options=MSSQL.WriteOptions( + if_exists="append", + ), + ) + writer.run(df) + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +.. dropdown:: See example + + .. code:: python + + mssql.execute( + """ + CREATE TABLE schema.table AS ( + id bigint, + key text, + value datetime2(6) -- specific type and precision + ) + """, + ) + + writer = DBWriter( + connection=mssql, + table="myschema.target_tbl", + options=MSSQL.WriteOptions(if_exists="append"), + ) + writer.run(df) + +References +~~~~~~~~~~ + +Here you can find source code with type conversions: + +* `MSSQL -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ +* `JDBC -> MSSQL `_ + +Supported types +--------------- + +See `official documentation `_ + +Numeric types +~~~~~~~~~~~~~ + ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===============================+===================================+===============================+===============================+ +| ``decimal`` | ``DecimalType(P=18, S=0)`` | ``decimal(P=18, S=0)`` | ``decimal(P=18, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38)`` | ``DecimalType(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ``decimal(P=0..38, S=0)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``decimal(P=0..38, S=0..38)`` | ``DecimalType(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | ``decimal(P=0..38, S=0..38)`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``real`` | ``FloatType()`` | ``real`` | ``real`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``float`` | ``DoubleType()`` | ``float`` | ``float`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``smallint`` | ``ShortType()`` | ``smallint`` | ``smallint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``tinyint`` | ``IntegerType()`` | ``int`` | ``int`` | ++-------------------------------+ | | | +| ``int`` | | | | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ +| ``bigint`` | ``LongType()`` | ``bigint`` | ``bigint`` | ++-------------------------------+-----------------------------------+-------------------------------+-------------------------------+ + +Temporal types +~~~~~~~~~~~~~~ + +.. note:: + + MSSQL ``timestamp`` type is alias for ``rowversion`` (see `Special types`_). It is not a temporal type! 
+ ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++==========================================+======================================+===================================+===============================+ +| ``date`` | ``DateType()`` | ``date`` | ``date`` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``smalldatetime``, minutes | ``TimestampType()``, microseconds | ``datetime2(6)``, microseconds | ``datetime``, milliseconds | ++------------------------------------------+ | | | +| ``datetime``, milliseconds | | | | ++------------------------------------------+ | | | +| ``datetime2(0)``, seconds | | | | ++------------------------------------------+ | | | +| ``datetime2(3)``, milliseconds | | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetime2(6)``, microseconds | ``TimestampType()``, microseconds | ``datetime2(6)``, microseconds | ``datetime``, milliseconds, | ++------------------------------------------+--------------------------------------+-----------------------------------+ **precision loss** [3]_ | +| ``datetime2(7)``, 100s of nanoseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds, | | +| | **precision loss** [4]_ | **precision loss** [4]_ | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(0)``, seconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | ``datetime``, milliseconds | ++------------------------------------------+ with time format quirks [5]_ | | | +| ``time(3)``, milliseconds | | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``time(6)``, microseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | ``datetime``, milliseconds, | ++ | with time format quirks [5]_ | | **precision loss** [3]_ | ++------------------------------------------+--------------------------------------+-----------------------------------+ + +| ``time``, 100s of nanoseconds | ``TimestampType()``, microseconds, | ``datetime2(6)``, microseconds | | ++------------------------------------------+ **precision loss** [4]_, | **precision loss** [3]_ | | +| ``time(7)``, 100s of nanoseconds | with time format quirks [5]_ | | | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ +| ``datetimeoffset`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++------------------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ + +.. 
warning:: + + Note that types in MSSQL and Spark have different value ranges: + + +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + +===================+================================+================================+=====================+================================+================================+ + | ``smalldatetime`` | ``1900-01-01 00:00:00`` | ``2079-06-06 23:59:00`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +-------------------+--------------------------------+--------------------------------+ | | | + | ``datetime`` | ``1753-01-01 00:00:00.000`` | ``9999-12-31 23:59:59.997`` | | | | + +-------------------+--------------------------------+--------------------------------+ | | | + | ``datetime2`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | | | | + +-------------------+--------------------------------+--------------------------------+ | | | + | ``time`` | ``00:00:00.0000000`` | ``23:59:59.9999999`` | | | | + +-------------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values in Spark DataFrame can be written to MSSQL. + + References: + * `Clickhouse DateTime documentation `_ + * `Clickhouse DateTime documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + +.. [3] + MSSQL dialect for Spark generates DDL with type ``datetime`` which has precision up to milliseconds (``23:59:59.999``, 10\ :superscript:`-3` seconds). + Inserting data with microsecond and higher precision (``23:59:59.999999`` .. ``23.59:59.9999999``, 10\ :superscript:`-6` .. 10\ :superscript:`-7` seconds) + will lead to **throwing away microseconds**. + +.. [4] + MSSQL support timestamp up to 100s of nanoseconds precision (``23:59:59.9999999999``, 10\ :superscript:`-7` seconds), + but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``, 10\ :superscript:`-6` seconds). + Last digit will be lost during read or write operations. + +.. [5] + ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. So instead of reading data from MSSQL like ``23:59:59.999999`` + it is actually read ``1970-01-01 23:59:59.999999``, and vice versa. 
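If only the time-of-day part of such a column is needed after reading, it can be extracted on the Spark side. A minimal sketch, assuming a hypothetical ``time_column`` read from a MSSQL ``time`` column:

.. code-block:: python

    from pyspark.sql import functions as F

    # Values arrive as TimestampType() with a fake 1970-01-01 date part,
    # so keep only the formatted time-of-day as a string column.
    df = df.withColumn("time_only", F.date_format("time_column", "HH:mm:ss.SSSSSS"))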
+ +String types +~~~~~~~~~~~~~ + ++-------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===================+==================+====================+=====================+ +| ``char`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++-------------------+ | | | +| ``char(N)`` | | | | ++-------------------+ | | | +| ``nchar`` | | | | ++-------------------+ | | | +| ``nchar(N)`` | | | | ++-------------------+ | | | +| ``varchar`` | | | | ++-------------------+ | | | +| ``varchar(N)`` | | | | ++-------------------+ | | | +| ``nvarchar`` | | | | ++-------------------+ | | | +| ``nvarchar(N)`` | | | | ++-------------------+ | | | +| ``mediumtext`` | | | | ++-------------------+ | | | +| ``text`` | | | | ++-------------------+ | | | +| ``ntext`` | | | | ++-------------------+ | | | +| ``xml`` | | | | ++-------------------+------------------+--------------------+---------------------+ + +Binary types +~~~~~~~~~~~~ + ++--------------------+-------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++====================+===================+====================+=====================+ +| ``bit`` | ``BooleanType()`` | ``bit`` | ``bit`` | ++--------------------+-------------------+--------------------+---------------------+ +| ``binary`` | ``BinaryType()`` | ``varbinary`` | ``varbinary`` | ++--------------------+ | | | +| ``binary(N)`` | | | | ++--------------------+ | | | +| ``varbinary`` | | | | ++--------------------+ | | | +| ``varbinary(N)`` | | | | ++--------------------+ | | | +| ``image`` | | | | ++--------------------+-------------------+--------------------+---------------------+ + +Special types +~~~~~~~~~~~~~~ + ++---------------------------+------------------+--------------------+---------------------+ +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | ++===========================+==================+====================+=====================+ +| ``geography`` | ``BinaryType()`` | ``varbinary`` | ``varbinary`` | ++---------------------------+ | | | +| ``geometry`` | | | | ++---------------------------+ | | | +| ``hierarchyid`` | | | | ++---------------------------+ | | | +| ``rowversion`` | | | | ++---------------------------+------------------+--------------------+---------------------+ +| ``sql_variant`` | unsupported | | | ++---------------------------+------------------+--------------------+---------------------+ +| ``sysname`` | ``StringType()`` | ``nvarchar`` | ``nvarchar`` | ++---------------------------+ | | | +| ``uniqueidentifier`` | | | | ++---------------------------+------------------+--------------------+---------------------+ + +Explicit type cast +------------------ + +``DBReader`` +~~~~~~~~~~~~ + +It is possible to explicitly cast column type using ``DBReader(columns=...)`` syntax. + +For example, you can use ``CAST(column AS text)`` to convert data to string representation on MSSQL side, and so it will be read as Spark's ``StringType()``: + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBReader + + mssql = MSSQL(...) 
+ + DBReader( + connection=mssql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + ], + ) + df = reader.run() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + ) + +``DBWriter`` +~~~~~~~~~~~~ + +Convert dataframe column to JSON using `to_json `_, +and write it as ``text`` column in MSSQL: + +.. code:: python + + mssql.execute( + """ + CREATE TABLE schema.target_tbl AS ( + id bigint, + struct_column_json text -- any string type, actually + ) + """, + ) + + from pyspark.sql.functions import to_json + + df = df.select( + df.id, + to_json(df.struct_column).alias("struct_column_json"), + ) + + writer.run(df) + +Then you can parse this column on MSSQL side - for example, by creating a view: + +.. code:: sql + + SELECT + id, + JSON_VALUE(struct_column_json, "$.nested.field") AS nested_field + FROM target_tbl + +Or by using `computed column `_: + +.. code-block:: sql + + CREATE TABLE schema.target_table ( + id bigint, + supported_column datetime2(6), + struct_column_json text, -- any string type, actually + -- computed column + nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) + -- or persisted column + -- nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) PERSISTED + ) + +By default, column value is calculated on every table read. +Column marked as ``PERSISTED`` is calculated during insert, but this require additional space. diff --git a/docs/connection/db_connection/mssql/write.rst b/docs/connection/db_connection/mssql/write.rst index c8a5e5906..eb15fe93e 100644 --- a/docs/connection/db_connection/mssql/write.rst +++ b/docs/connection/db_connection/mssql/write.rst @@ -1,9 +1,47 @@ .. _mssql-write: -Writing to MSSQL -================ +Writing to MSSQL using ``DBWriter`` +==================================== -For writing data to MSSQL, use :obj:`DBWriter ` with options below. +For writing data to MSSQL, use :obj:`DBWriter `. + +.. warning:: + + Please take into account :ref:`mssql-types` + +.. warning:: + + It is always recommended to create table explicitly using :obj:`MSSQL.execute ` + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +Examples +-------- + +.. code-block:: python + + from onetl.connection import MSSQL + from onetl.db import DBWriter + + mssql = MSSQL(...) + + df = ... # data is here + + writer = DBWriter( + connection=mssql, + target="schema.table", + options=MSSQL.WriteOptions(if_exists="append"), + ) + + writer.run(df) + + +Write options +------------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index eeb59cf44..99b7820cb 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -47,7 +47,7 @@ used for creating a connection: .. 
code-tab:: sql Read + Write - -- allow external tables in the same schema as target table + -- allow creating tables in the target schema GRANT CREATE ON myschema.* TO username@'192.168.1.%'; -- allow read & write access to specific table diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst index 515dcf77c..6ef094cd4 100644 --- a/docs/connection/db_connection/mysql/sql.rst +++ b/docs/connection/db_connection/mysql/sql.rst @@ -1,13 +1,13 @@ -.. _clickhouse-sql: +.. _mysql-sql: -Reading from Clickhouse using ``Clickhouse.sql`` -================================================ +Reading from MySQL using ``MySQL.sql`` +====================================== .. warning:: - Please take into account :ref:`clickhouse-types` + Please take into account :ref:`mysql-types` -:obj:`Clickhouse.sql ` allows passing custom SQL query, +:obj:`MySQL.sql ` allows passing custom SQL query, but does not support incremental strategies. Method also accepts :obj:`JDBCReadOptions `. @@ -17,22 +17,20 @@ Syntax support Only queries with the following syntax are supported: -* ``SELECT ...`` -* ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples -------- .. code-block:: python - from onetl.connection import Clickhouse + from onetl.connection import MySQL - clickhouse = Clickhouse(...) - df = clickhouse.sql( + mysql = MySQL(...) + df = mysql.sql( """ SELECT id, @@ -44,12 +42,12 @@ Examples WHERE key = 'something' """, - options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + options=MySQL.ReadOptions(partition_column="id", num_partitions=10), ) References ---------- -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +.. currentmodule:: onetl.connection.db_connection.mysql.connection -.. automethod:: Clickhouse.sql +.. automethod:: MySQL.sql diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst index 4968bd434..431665f87 100644 --- a/docs/connection/db_connection/mysql/types.rst +++ b/docs/connection/db_connection/mysql/types.rst @@ -47,7 +47,7 @@ This is how MySQL connector performs this: * Write DataFrame to created table as is. But some cases this may lead to using wrong column type. For example, Spark creates column of type ``timestamp`` -which corresponds to MySQL's type ``timestamp(0)`` (precision up to seconds) +which corresponds to MySQL type ``timestamp(0)`` (precision up to seconds) instead of more precise ``timestamp(6)`` (precision up to nanoseconds). This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. 
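As with the other JDBC connections documented in this patch, a safer pattern is to create the MySQL table explicitly before writing. A minimal sketch with hypothetical table and column names, assuming ``MySQL.execute`` and ``MySQL.WriteOptions`` mirror the MSSQL examples above:

.. code-block:: python

    from onetl.connection import MySQL
    from onetl.db import DBWriter

    mysql = MySQL(...)

    mysql.execute(
        """
        CREATE TABLE myschema.target_tbl (
            id BIGINT,
            updated_at TIMESTAMP(6) -- explicit precision instead of Spark-generated timestamp(0)
        )
        """,
    )

    writer = DBWriter(
        connection=mysql,
        target="myschema.target_tbl",
        options=MySQL.WriteOptions(if_exists="append"),
    )
    writer.run(df)  # df is the DataFrame to be written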
@@ -152,61 +152,57 @@ Temporal types +-----------------------------------+ | | | | ``date`` | | | | +-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ -| ``datetime``, seconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds | +| ``datetime``, seconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds | +-----------------------------------+ | | | | ``timestamp``, seconds | | | | +-----------------------------------+ | | | -| ``datetime(N=0)``, seconds | | | | +| ``datetime(0)``, seconds | | | | +-----------------------------------+ | | | -| ``timestamp(N=0)``, seconds | | | | +| ``timestamp(0)``, seconds | | | | +-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ -| ``datetime(N=3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds, | +| ``datetime(3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds, | +-----------------------------------+ | | **precision loss** [4]_, | -| ``timestamp(N=6)``, milliseconds | | | | +| ``timestamp(3)``, milliseconds | | | | +-----------------------------------+ | | | -| ``datetime(N=6)``, microseconds | | | | +| ``datetime(6)``, microseconds | | | | +-----------------------------------+ | | | -| ``timestamp(N=6)``, microseconds | | | | +| ``timestamp(6)``, microseconds | | | | +-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ -| ``time``, seconds | ``TimestampType()``, microseconds, | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds | +| ``time``, seconds | ``TimestampType()``, microseconds, | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds | +-----------------------------------+ with time format quirks [5]_ | | | -| ``time(N=0)``, seconds | | | | +| ``time(0)``, seconds | | | | +-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ -| ``time(N=3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(N=6)``, microseconds | ``timestamp(N=0)``, seconds, | +| ``time(3)``, milliseconds | ``TimestampType()``, microseconds | ``timestamp(6)``, microseconds | ``timestamp(0)``, seconds, | +-----------------------------------+ with time format quirks [5]_ | | **precision loss** [4]_, | -| ``time(N=6)``, microseconds | | | | +| ``time(6)``, microseconds | | | | +-----------------------------------+--------------------------------------+-----------------------------------+-------------------------------+ .. 
warning:: Note that types in MySQL and Spark have different value ranges: - +-----------------------------+-------------------------+-------------------------+ - | Type | Min value | Max value | - +=============================+=========================+=========================+ - | MySQL's ``year`` | ``1901`` | ``2155`` | - +-----------------------------+-------------------------+-------------------------+ - | MySQL's ``date`` | ``1000-01-01`` | ``9999-12-31`` | - +-----------------------------+-------------------------+-------------------------+ - | Spark's ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | - +-----------------------------+-------------------------+-------------------------+ - | MySQL's ``datetime`` | ``1000-01-01 00:00:00`` | ``9999-12-31 23:59:59`` | - +-----------------------------+-------------------------+-------------------------+ - | MySQL's ``timestamp`` | ``1970-01-01 00:00:01`` | ``9999-12-31 23:59:59`` | - +-----------------------------+-------------------------+-------------------------+ - | MySQL's ``time`` | ``-838:59:59`` | ``838:59:59`` | - +-----------------------------+-------------------------+-------------------------+ - | Spark's ``TimestampType()`` | ``0001-01-01 00:00:00`` | ``9999-12-31 23:59:59`` | - +-----------------------------+-------------------------+-------------------------+ + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + +===============+================================+================================+=====================+================================+================================+ + | ``year`` | ``1901`` | ``2155`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``date`` | ``1000-01-01`` | ``9999-12-31`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``datetime`` | ``1000-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.499999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+--------------------------------+--------------------------------+ | | | + | ``timestamp`` | ``1970-01-01 00:00:01.000000`` | ``9999-12-31 23:59:59.499999`` | | | | + +---------------+--------------------------------+--------------------------------+ | | | + | ``time`` | ``-838:59:59.000000`` | ``838:59:59.000000`` | | | | + +---------------+--------------------------------+--------------------------------+---------------------+--------------------------------+--------------------------------+ So Spark can read all the values from MySQL, but not all of values in Spark DataFrame can be written to MySQL. - See: - * `MySQL's year documentation `_ - * `MySQL's date, datetime & timestamp documentation `_ - * `MySQL's time documentation `_ - * `Spark's DateType documentation `_ - * `Spark's TimestampType documentation `_ + References: + * `MySQL year documentation `_ + * `MySQL date, datetime & timestamp documentation `_ + * `MySQL time documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ .. 
[4] MySQL dialect generates DDL with MySQL type ``timestamp`` which is alias for ``timestamp(0)`` with precision up to seconds (``23:59:59``). @@ -244,44 +240,44 @@ String types Binary types ~~~~~~~~~~~~ -+---------------------------+------------------+--------------------+---------------------+ -| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | -+===========================+==================+====================+=====================+ -| ``binary`` | ``BinaryType()`` | ``blob`` | ``blob`` | -+---------------------------+ | | | -| ``binary(N)`` | | | | -+---------------------------+ | | | -| ``varbinary(N)`` | | | | -+---------------------------+ | | | -| ``mediumblob`` | | | | -+---------------------------+ | | | -| ``blob`` | | | | -+---------------------------+ | | | -| ``longblob`` | | | | -+---------------------------+------------------+--------------------+---------------------+ ++-------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++===================+==================+====================+=====================+ +| ``binary`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++-------------------+ | | | +| ``binary(N)`` | | | | ++-------------------+ | | | +| ``varbinary(N)`` | | | | ++-------------------+ | | | +| ``mediumblob`` | | | | ++-------------------+ | | | +| ``blob`` | | | | ++-------------------+ | | | +| ``longblob`` | | | | ++-------------------+------------------+--------------------+---------------------+ Geometry types ~~~~~~~~~~~~~~ -+---------------------------+------------------+--------------------+---------------------+ -| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | -+===========================+==================+====================+=====================+ -| ``point`` | ``BinaryType()`` | ``blob`` | ``blob`` | -+---------------------------+ | | | -| ``linestring`` | | | | -+---------------------------+ | | | -| ``polygon`` | | | | -+---------------------------+ | | | -| ``geometry`` | | | | -+---------------------------+ | | | -| ``multipoint`` | | | | -+---------------------------+ | | | -| ``multilinestring`` | | | | -+---------------------------+ | | | -| ``multipolygon`` | | | | -+---------------------------+ | | | -| ``geometrycollection`` | | | | -+---------------------------+------------------+--------------------+---------------------+ ++------------------------+------------------+--------------------+---------------------+ +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | ++========================+==================+====================+=====================+ +| ``point`` | ``BinaryType()`` | ``blob`` | ``blob`` | ++------------------------+ | | | +| ``linestring`` | | | | ++------------------------+ | | | +| ``polygon`` | | | | ++------------------------+ | | | +| ``geometry`` | | | | ++------------------------+ | | | +| ``multipoint`` | | | | ++------------------------+ | | | +| ``multilinestring`` | | | | ++------------------------+ | | | +| ``multipolygon`` | | | | ++------------------------+ | | | +| ``geometrycollection`` | | | | ++------------------------+------------------+--------------------+---------------------+ Explicit type cast ------------------ @@ -311,7 +307,7 @@ to convert column of any type to string representation, and then parse this colu columns=[ "id", "supported_column", - "CAST(unsupported_column AS TEXT) unsupported_column_str", + 
"CAST(unsupported_column AS text) unsupported_column_str", # or "JSON_OBJECT('key', value_column) json_column", ], @@ -362,7 +358,10 @@ Then you can parse this column on MySQL side - for example, by creating a view: .. code:: sql - SELECT id, json_column->"$[0]" AS array_item FROM target_tbl + SELECT + id, + array_column_json->"$[0]" AS array_item + FROM target_tbl Or by using `GENERATED column `_: @@ -372,7 +371,10 @@ Or by using `GENERATED column "$[0]")) VIRTUAL + -- or stired column + -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) STORED ) ``VIRTUAL`` column value is calculated on every table read. diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst index c86bc393c..b5b64e437 100644 --- a/docs/connection/db_connection/oracle/prerequisites.rst +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -88,14 +88,14 @@ used for creating a connection: -- as Oracle does not support specifying exact schema name GRANT CREATE ANY TABLE TO username; + -- allow read & write access to specific table + GRANT SELECT, INSERT ON someschema.mytable TO username; + -- only if if_exists="replace_entire_table" is used: -- allow dropping/truncating tables in any schema, -- as Oracle does not support specifying exact schema name GRANT DROP ANY TABLE TO username; - -- allow read & write access to specific table - GRANT SELECT, INSERT ON someschema.mytable TO username; - .. code-tab:: sql Read only -- allow user to log in diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst index 3ea0832d1..6971de4ed 100644 --- a/docs/connection/db_connection/oracle/sql.rst +++ b/docs/connection/db_connection/oracle/sql.rst @@ -17,12 +17,10 @@ Syntax support Only queries with the following syntax are supported: -* ``SELECT ...`` -* ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) 
SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples -------- diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst index 907afe1c9..c06e43b13 100644 --- a/docs/connection/db_connection/oracle/types.rst +++ b/docs/connection/db_connection/oracle/types.rst @@ -120,7 +120,7 @@ Numeric types +----------------------------------+-----------------------------------+-------------------------------+---------------------------+ | ``FLOAT`` | ``DecimalType(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | ``NUMBER(P=38, S=10)`` | +----------------------------------+ | | | -| ``FLOAT(N=1..126)`` | | | | +| ``FLOAT(N)`` | | | | +----------------------------------+ | | | | ``REAL`` | | | | +----------------------------------+ | | | @@ -162,17 +162,36 @@ Temporal types +--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ | ``TIMESTAMP WITH TIME ZONE`` | unsupported | | | +--------------------------------------------+ | | | -| ``TIMESTAMP(N=0..9) WITH TIME ZONE`` | | | | +| ``TIMESTAMP(N) WITH TIME ZONE`` | | | | +--------------------------------------------+ | | | | ``TIMESTAMP WITH LOCAL TIME ZONE`` | | | | +--------------------------------------------+ | | | -| ``TIMESTAMP(N=0..9) WITH LOCAL TIME ZONE`` | | | | +| ``TIMESTAMP(N) WITH LOCAL TIME ZONE`` | | | | +--------------------------------------------+ | | | | ``INTERVAL YEAR TO MONTH`` | | | | +--------------------------------------------+ | | | | ``INTERVAL DAY TO SECOND`` | | | | +--------------------------------------------+------------------------------------+---------------------------------+---------------------------------+ +.. warning:: + + Note that types in Oracle and Spark have different value ranges: + + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Oracle type | Min value | Max value | Spark type | Min value | Max value | + +===============+====================================+===================================+=====================+================================+================================+ + | ``date`` | ``-4712-01-01`` | ``9999-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4712-01-01 00:00:00.000000000`` | ``9999-12-31 23:59:59.999999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+------------------------------------+-----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Oracle to Spark. + + References: + * `Oracle date, timestamp and intervals documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + .. [4] Oracle support timestamp up to nanoseconds precision (``23:59:59.999999999``), but Spark ``TimestampType()`` supports datetime up to microseconds precision (``23:59:59.999999``). 
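If the fractional part of such timestamps matters, it may be preferable to make the truncation explicit on the Oracle side instead of relying on the implicit conversion, for example by passing a cast expression to ``DBReader``. A sketch with illustrative schema, table and column names:

.. code-block:: python

    from onetl.connection import Oracle
    from onetl.db import DBReader

    oracle = Oracle(...)  # connection parameters omitted

    reader = DBReader(
        connection=oracle,
        source="MYSCHEMA.MYTABLE",
        columns=[
            "ID",
            # truncate nanoseconds to microseconds explicitly,
            # matching Spark TimestampType() precision
            "CAST(CREATED_AT AS TIMESTAMP(6)) AS CREATED_AT",
        ],
    )
    df = reader.run()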
diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index 1430381a1..e0316e2a4 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -17,12 +17,9 @@ Syntax support Only queries with the following syntax are supported: -* ``SELECT ...`` -* ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples -------- diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index 224d4ee4d..5424289e4 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -38,8 +38,7 @@ This is how Postgres connector performs this: it will be populated by Postgres. .. [3] - This is true only if conversion of ``T_df -> T_tbl`` target or source types is ``text`` or ``StringType()``. So it is possible to - insert data back to column with original type, if data format is matching the column type. + This is true only if either DataFrame column is a ``StringType()``, or target column is ``text`` type. But other types cannot be silently converted, like ``bytea -> bit(N)``. This requires explicit casting, see `Manual conversion to string`_. @@ -202,6 +201,27 @@ Temporal types | ``tstzrange`` | | | | +------------------------------------+------------------------------+-----------------------+-------------------------+ +.. warning:: + + Note that types in Postgres and Spark have different value ranges: + + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | Postgres type | Min value | Max value | Spark type | Min value | Max value | + +===============+=================================+==================================+=====================+================================+================================+ + | ``date`` | ``-4713-01-01`` | ``5874897-01-01`` | ``DateType()`` | ``0001-01-01`` | ``9999-12-31`` | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + | ``timestamp`` | ``-4713-01-01 00:00:00.000000`` | ``294276-12-31 23:59:59.999999`` | ``TimestampType()`` | ``0001-01-01 00:00:00.000000`` | ``9999-12-31 23:59:59.999999`` | + +---------------+---------------------------------+----------------------------------+ | | | + | ``time`` | ``00:00:00.000000`` | ``24:00:00.000000`` | | | | + +---------------+---------------------------------+----------------------------------+---------------------+--------------------------------+--------------------------------+ + + So not all of values can be read from Postgres to Spark. + + References: + * `Postgres date/time types documentation `_ + * `Spark DateType documentation `_ + * `Spark TimestampType documentation `_ + .. [6] ``time`` type is the same as ``timestamp`` with date ``1970-01-01``. 
So instead of reading data from Postgres like ``23:59:59`` diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index c2716ce72..608ef7521 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -29,30 +29,9 @@ class MSSQL(JDBCConnection): (`official MSSQL JDBC driver `_). - .. dropdown:: Version compatibility - - * SQL Server versions: 2014 - 2022 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_ - and `official compatibility matrix `_. - .. warning:: - To use MSSQL connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. + Before using this connector please take into account :ref:`mssql-prerequisites` Parameters ---------- From 3e5925bde853731afb788a5ecc226905573db377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 15 Mar 2024 10:35:14 +0000 Subject: [PATCH 50/63] [DOP-13739] Return back handling DBReader(columns="string") --- docs/changelog/next_release/238.bugfix.rst | 2 ++ onetl/base/base_db_connection.py | 2 +- .../dialect_mixins/support_columns_list.py | 13 ++++++-- onetl/db/db_reader/db_reader.py | 9 ++++-- .../test_common_reader_unit.py | 31 ++++++++++++++++++- 5 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/next_release/238.bugfix.rst diff --git a/docs/changelog/next_release/238.bugfix.rst b/docs/changelog/next_release/238.bugfix.rst new file mode 100644 index 000000000..558a16d08 --- /dev/null +++ b/docs/changelog/next_release/238.bugfix.rst @@ -0,0 +1,2 @@ +Return back handling of ``DBReader(columns="string")``. This was a valid syntax up to v0.10 release, but it was removed because +most of users neved used it. It looks that we were wrong, returning this behavior back, but with deprecation warning. diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index c11fb802e..670b15ec5 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -35,7 +35,7 @@ def validate_name(self, value: str) -> str: """ @abstractmethod - def validate_columns(self, columns: list[str] | None) -> list[str] | None: + def validate_columns(self, columns: str | list[str] | None) -> list[str] | None: """Check if ``columns`` value is valid. 
Raises diff --git a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py index a45f47d28..e443cf8f8 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py +++ b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py @@ -2,15 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import Any +import warnings class SupportColumns: def validate_columns( self, - columns: Any, + columns: str | list[str] | None, ) -> list[str] | None: if columns is None: return ["*"] + if isinstance(columns, str): + warnings.warn( + f"Passing DBReader(columns={columns!r}) is deprecated since v0.10.0 and will be removed in v1.0.0. " + f"Use DBReader(columns=[{columns!r}] instead", + category=UserWarning, + stacklevel=3, + ) + return [columns] + return list(columns) diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 83953b924..af038c24f 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -107,6 +107,11 @@ class DBReader(FrozenModel): It is recommended to pass column names explicitly to avoid selecting too many columns, and to avoid adding unexpected columns to dataframe if source DDL is changed. + .. deprecated:: 0.10.0 + + Syntax ``DBReader(columns="col1, col2")`` (string instead of list) is not supported, + and will be removed in v1.0.0 + where : Any, default: ``None`` Custom ``where`` for SQL query or MongoDB pipeline. @@ -384,8 +389,8 @@ def validate_source(cls, source, values): connection: BaseDBConnection = values["connection"] return connection.dialect.validate_name(source) - @validator("columns", always=True) # noqa: WPS231 - def validate_columns(cls, value: list[str] | None, values: dict) -> list[str] | None: + @validator("columns", always=True, pre=True) + def validate_columns(cls, value: str | list[str] | None, values: dict) -> list[str] | None: connection: BaseDBConnection = values["connection"] return connection.dialect.validate_columns(value) diff --git a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py index a21fb1daa..5f61e6a07 100644 --- a/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py +++ b/tests/tests_unit/test_db/test_db_reader_unit/test_common_reader_unit.py @@ -58,7 +58,6 @@ def test_reader_hive_with_read_options(spark_mock): (), {}, set(), - "any,string", ], ) def test_reader_invalid_columns(spark_mock, columns): @@ -73,6 +72,7 @@ def test_reader_invalid_columns(spark_mock, columns): @pytest.mark.parametrize( "columns, real_columns", [ + (None, ["*"]), (["*"], ["*"]), (["abc", "cde"], ["abc", "cde"]), (["*", "abc"], ["*", "abc"]), @@ -88,6 +88,35 @@ def test_reader_valid_columns(spark_mock, columns, real_columns): assert reader.columns == real_columns +@pytest.mark.parametrize( + "column, real_columns, msg", + [ + ( + "*", + ["*"], + "Passing DBReader(columns='*') is deprecated since v0.10.0 and will be removed in v1.0.0. " + "Use DBReader(columns=['*'] instead", + ), + ( + "some, column", + ["some, column"], + "Passing DBReader(columns='some, column') is deprecated since v0.10.0 and will be removed in v1.0.0. 
" + "Use DBReader(columns=['some, column'] instead", + ), + ], + ids=["*", "some, column"], +) +def test_reader_legacy_columns(spark_mock, column, real_columns, msg): + with pytest.warns(UserWarning, match=re.escape(msg)): + reader = DBReader( + connection=Hive(cluster="rnd-dwh", spark=spark_mock), + table="schema.table", + columns=column, + ) + + assert reader.columns == real_columns + + @pytest.mark.parametrize( "hwm_column, real_hwm_expression", [ From 934cfc34bd84b522c872730e40eaca30de1e39ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 15 Mar 2024 13:18:48 +0000 Subject: [PATCH 51/63] [DOP-13737] Downgrade Greenplum package to 2.2.0 --- .github/workflows/data/greenplum/matrix.yml | 5 +++++ .github/workflows/nightly.yml | 1 + .github/workflows/test-greenplum.yml | 4 ++++ .github/workflows/tests.yml | 1 + docs/changelog/next_release/239.bugfix.rst | 8 +++++++ .../db_connection/greenplum/prerequisites.rst | 12 +++++++--- .../db_connection/greenplum/connection.py | 16 +++++++------- tests/fixtures/spark.py | 7 +++++- .../test_greenplum_unit.py | 22 +++++++++---------- 9 files changed, 53 insertions(+), 23 deletions(-) create mode 100644 docs/changelog/next_release/239.bugfix.rst diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 05afbb3bd..2b66c0e19 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -25,14 +25,19 @@ latest: &latest matrix: small: - greenplum-version: 7.0.0 + package-version: 2.3.1 <<: *max full: - greenplum-version: 6.25.3 + package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 + package-version: 2.3.1 <<: *max nightly: - greenplum-version: 6.25.3 + package-version: 2.2.0 <<: *min - greenplum-version: 7.0.0 + package-version: 2.3.1 <<: *latest diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3c3b9d3bc..34a0af1b3 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -67,6 +67,7 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + package-version: ${{ matrix.package-version }} pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ matrix.java-version }} python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index d8174979e..bf5e5012e 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -5,6 +5,9 @@ on: greenplum-version: required: true type: string + package-version: + required: true + type: string spark-version: required: true type: string @@ -107,6 +110,7 @@ jobs: mkdir reports/ || echo "Directory exists" sed '/^$/d' ./.env.local | sed '/^#/d' | sed 's/^/export /' > ./env source ./env + export ONETL_GP_PACKAGE_VERSION=${{ inputs.package-version }} ./pytest_runner.sh -m greenplum env: ONETL_DB_WITH_GREENPLUM: 'true' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3ee7f3f80..058ba2d54 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -66,6 +66,7 @@ jobs: with: greenplum-version: ${{ matrix.greenplum-version }} spark-version: ${{ matrix.spark-version }} + package-version: ${{ matrix.package-version }} pydantic-version: ${{ matrix.pydantic-version }} java-version: ${{ 
matrix.java-version }} python-version: ${{ matrix.python-version }} diff --git a/docs/changelog/next_release/239.bugfix.rst b/docs/changelog/next_release/239.bugfix.rst new file mode 100644 index 000000000..c6b9bc48d --- /dev/null +++ b/docs/changelog/next_release/239.bugfix.rst @@ -0,0 +1,8 @@ +Downgrade Greenplum package version from 2.3.0 to 2.2.0. This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x. +Connector can open transaction with ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks. + +For using this connector with Greenplum 7.x, please pass package version explicitly: + +.. code-block:: python + + maven_packages = Greenplum.get_packages(package_version="2.3.0", ...) diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index a5509a003..04595766c 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -6,11 +6,11 @@ Prerequisites Version Compatibility --------------------- -* Greenplum server versions: 5.x, 6.x, 7.x +* Greenplum server versions: 5.x, 6.x, 7.x (requires ``Greenplum.get_packages(package_version="2.3.0")`` or higher) * Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 -See `official documentation `_. +See `official documentation `_. Installing PySpark ------------------ @@ -24,13 +24,19 @@ Downloading VMware package -------------------------- To use Greenplum connector you should download connector ``.jar`` file from -`VMware website `_ +`VMware website `_ and then pass it to Spark session. .. warning:: Please pay attention to :ref:`Spark & Scala version compatibility `. +.. warning:: + + There are issues with using package of version 2.3.0/2.3.1 with Greenplum 6.x - connector can + open transaction with ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks + during write. + There are several ways to do that. See :ref:`java-packages` for details. .. note:: diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index bdd509862..4460be149 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -67,7 +67,7 @@ class Config: class Greenplum(JDBCMixin, DBConnection): """Greenplum connection. |support_hooks| - Based on package ``io.pivotal:greenplum-spark:2.3.0`` + Based on package ``io.pivotal:greenplum-spark:2.2.0`` (`VMware Greenplum connector for Spark `_). .. warning:: @@ -187,7 +187,7 @@ def get_packages( Used only if ``scala_version=None``. 
- package_version : str, optional, default ``2.3.0`` + package_version : str, optional, default ``2.2.0`` Package version in format ``major.minor.patch`` Examples @@ -197,8 +197,8 @@ def get_packages( from onetl.connection import Greenplum - Greenplum.get_packages(scala_version="2.11") - Greenplum.get_packages(spark_version="3.2") + Greenplum.get_packages(scala_version="2.12") + Greenplum.get_packages(spark_version="3.2", package_version="2.3.0") """ @@ -206,7 +206,7 @@ def get_packages( if package_version: package_ver = Version.parse(package_version) else: - package_ver = Version(2, 3, 0) + package_ver = Version(2, 2, 0) if scala_version: scala_ver = Version.parse(scala_version) @@ -228,21 +228,21 @@ def package_spark_2_3(cls) -> str: """Get package name to be downloaded by Spark 2.3.""" msg = "`Greenplum.package_2_3` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.3')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.3.0" + return "io.pivotal:greenplum-spark_2.11:2.2.0" @classproperty def package_spark_2_4(cls) -> str: """Get package name to be downloaded by Spark 2.4.""" msg = "`Greenplum.package_2_4` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='2.4')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.11:2.3.0" + return "io.pivotal:greenplum-spark_2.11:2.2.0" @classproperty def package_spark_3_2(cls) -> str: """Get package name to be downloaded by Spark 3.2.""" msg = "`Greenplum.package_3_2` will be removed in 1.0.0, use `Greenplum.get_packages(spark_version='3.2')` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "io.pivotal:greenplum-spark_2.12:2.3.0" + return "io.pivotal:greenplum-spark_2.12:2.2.0" @property def instance_url(self) -> str: diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index f56fa4002..b2559d520 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -59,7 +59,12 @@ def maven_packages(): with_greenplum = os.getenv("ONETL_DB_WITH_GREENPLUM", "false").lower() == "true" if with_greenplum: # Greenplum connector jar is not publicly available, - packages.extend(Greenplum.get_packages(spark_version=pyspark_version)) + packages.extend( + Greenplum.get_packages( + spark_version=pyspark_version, + package_version=os.getenv("ONETL_GP_PACKAGE_VERSION") or None, + ), + ) if pyspark_version >= (2, 4): # There is no Avro package for Spark 2.3 diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 71308e2f9..bda06d282 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -15,9 +15,9 @@ def test_greenplum_driver(): def test_greenplum_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Greenplum.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.3.0" - assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.3.0" - assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.3.0" + assert Greenplum.package_spark_2_3 == "io.pivotal:greenplum-spark_2.11:2.2.0" + assert Greenplum.package_spark_2_4 == "io.pivotal:greenplum-spark_2.11:2.2.0" + assert Greenplum.package_spark_3_2 == "io.pivotal:greenplum-spark_2.12:2.2.0" def test_greenplum_get_packages_no_input(): @@ -56,15 
+56,15 @@ def test_greenplum_get_packages_scala_version_not_supported(scala_version): "spark_version, scala_version, package", [ # use Scala version directly - (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), - (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + (None, "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), # Detect Scala version by Spark version - ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.3", None, "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("2.4", None, "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("3.2", None, "io.pivotal:greenplum-spark_2.12:2.2.0"), # Override Scala version detected automatically - ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.3.0"), - ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + ("2.3", "2.11", "io.pivotal:greenplum-spark_2.11:2.2.0"), + ("2.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), ], ) def test_greenplum_get_packages(spark_version, scala_version, package): @@ -74,7 +74,7 @@ def test_greenplum_get_packages(spark_version, scala_version, package): @pytest.mark.parametrize( "package_version, scala_version, package", [ - (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), + (None, "2.12", "io.pivotal:greenplum-spark_2.12:2.2.0"), ("2.3.0", "2.12", "io.pivotal:greenplum-spark_2.12:2.3.0"), ("2.1.4", "2.12", "io.pivotal:greenplum-spark_2.12:2.1.4"), ], From a286840cff7a89b2cfdd9f6cd23dd64413b08cad Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:05:41 +0000 Subject: [PATCH 52/63] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.2.0 → 24.3.0](https://github.com/psf/black/compare/24.2.0...24.3.0) - [github.com/PyCQA/autoflake: v2.3.0 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.3.0...v2.3.1) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55ad11f4d..c591ac80d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,7 +95,7 @@ repos: args: [--py37-plus, --keep-runtime-typing] - repo: https://github.com/psf/black - rev: 24.2.0 + rev: 24.3.0 hooks: - id: black language_version: python3 @@ -113,7 +113,7 @@ repos: - id: check-useless-excludes - repo: https://github.com/PyCQA/autoflake - rev: v2.3.0 + rev: v2.3.1 hooks: - id: autoflake args: From b98cac7e501f4fab0a9337bc78868dd0edb386de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 15 Mar 2024 16:35:40 +0000 Subject: [PATCH 53/63] [DOP-13779] Improve Teradata documentation --- .github/workflows/tests.yml | 2 +- .../next_release/240.improvement.rst | 4 + .../db_connection/clickhouse/execute.rst | 42 ++++--- .../db_connection/clickhouse/read.rst | 25 +++- .../db_connection/clickhouse/sql.rst | 37 ++++-- .../db_connection/clickhouse/write.rst | 6 +- .../db_connection/greenplum/execute.rst | 35 +++--- .../db_connection/greenplum/read.rst | 35 ++++-- .../db_connection/greenplum/write.rst | 6 +- .../db_connection/mongodb/pipeline.rst | 14 ++- 
.../connection/db_connection/mongodb/read.rst | 16 ++- .../db_connection/mssql/execute.rst | 36 +++--- docs/connection/db_connection/mssql/read.rst | 25 +++- docs/connection/db_connection/mssql/sql.rst | 32 +++-- docs/connection/db_connection/mssql/write.rst | 7 +- .../db_connection/mysql/execute.rst | 41 +++--- docs/connection/db_connection/mysql/read.rst | 25 +++- docs/connection/db_connection/mysql/sql.rst | 32 +++-- docs/connection/db_connection/mysql/write.rst | 7 +- .../db_connection/oracle/execute.rst | 40 +++--- docs/connection/db_connection/oracle/read.rst | 25 +++- docs/connection/db_connection/oracle/sql.rst | 32 +++-- .../connection/db_connection/oracle/write.rst | 7 +- .../db_connection/postgres/execute.rst | 40 +++--- .../db_connection/postgres/read.rst | 25 +++- .../connection/db_connection/postgres/sql.rst | 32 +++-- .../db_connection/postgres/write.rst | 7 +- .../db_connection/teradata/execute.rst | 100 ++++++++++++++- .../db_connection/teradata/index.rst | 2 + .../db_connection/teradata/prerequisites.rst | 60 +++++++++ .../db_connection/teradata/read.rst | 117 ++++++++++++++++-- .../connection/db_connection/teradata/sql.rst | 66 ++++++++++ .../db_connection/teradata/write.rst | 113 ++++++++++++++++- .../connection/db_connection/hive/options.py | 4 +- .../jdbc_connection/connection.py | 42 +------ .../db_connection/jdbc_connection/options.py | 12 +- .../db_connection/jdbc_mixin/connection.py | 104 +--------------- .../db_connection/teradata/connection.py | 28 +---- 38 files changed, 887 insertions(+), 396 deletions(-) create mode 100644 docs/changelog/next_release/240.improvement.rst create mode 100644 docs/connection/db_connection/teradata/prerequisites.rst create mode 100644 docs/connection/db_connection/teradata/sql.rst diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 058ba2d54..f7a7cf074 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -398,7 +398,7 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} file: ./reports/coverage.xml fail_ci_if_error: true - plugin: 'noop' + plugin: noop - name: All done run: echo 1 diff --git a/docs/changelog/next_release/240.improvement.rst b/docs/changelog/next_release/240.improvement.rst new file mode 100644 index 000000000..2284bd0d9 --- /dev/null +++ b/docs/changelog/next_release/240.improvement.rst @@ -0,0 +1,4 @@ +Improve Teradata documentation: + * Add "Prerequisites" section describing different aspects of connecting to Teradata + * Separate documentation of ``DBReader`` and ``Teradata.sql`` + * Add examples for ``Teradata.fetch`` and ``Teradata.execute`` diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index ef9ad2ecf..11e01c753 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -3,20 +3,30 @@ Executing statements in Clickhouse ================================== +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Clickhouse.sql ` instead. 
+ How to ------ There are 2 ways to execute some statement in Clickhouse -Use :obj:`Clickhouse.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Clickhouse.fetch`` +~~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -Clickhouse config, or reading data from some reference table. +Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Clickhouse.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`clickhouse-types`. Syntax support ^^^^^^^^^^^^^^ @@ -45,23 +55,23 @@ Examples clickhouse.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`Clickhouse.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Clickhouse.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Clickhouse.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ This method supports **any** query syntax supported by Clickhouse, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on * ✅︎ ``ALTER ...`` -* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on * ✅︎ other statements not mentioned here * ❌ ``SET ...; SELECT ...;`` - multiple statements not supported @@ -76,6 +86,7 @@ Examples clickhouse = Clickhouse(...) with clickhouse: + # automatically close connection after exiting this context manager clickhouse.execute("DROP TABLE schema.table") clickhouse.execute( """ @@ -90,14 +101,15 @@ Examples options=Clickhouse.JDBCOptions(query_timeout=10), ) -References ----------- +Notes +------ + +These methods **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +So it should **NOT** be used to read large amounts of data. Use :ref:`DBReader ` or :ref:`Clickhouse.sql ` instead. -.. automethod:: Clickhouse.fetch -.. automethod:: Clickhouse.execute -.. automethod:: Clickhouse.close +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/clickhouse/read.rst b/docs/connection/db_connection/clickhouse/read.rst index e421f340b..33bcff4ce 100644 --- a/docs/connection/db_connection/clickhouse/read.rst +++ b/docs/connection/db_connection/clickhouse/read.rst @@ -3,13 +3,13 @@ Reading from Clickhouse using ``DBReader`` ========================================== +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. + .. 
warning:: Please take into account :ref:`clickhouse-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom queries, like JOINs. - Supported DBReader features --------------------------- @@ -67,8 +67,23 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -Read options ------------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Clickhouse to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/clickhouse/sql.rst b/docs/connection/db_connection/clickhouse/sql.rst index 658bd052a..1a3a1d52a 100644 --- a/docs/connection/db_connection/clickhouse/sql.rst +++ b/docs/connection/db_connection/clickhouse/sql.rst @@ -3,14 +3,16 @@ Reading from Clickhouse using ``Clickhouse.sql`` ================================================ +``Clickhouse.sql`` allows passing custom SQL query, but does not support incremental strategies. + .. warning:: Please take into account :ref:`clickhouse-types` -:obj:`Clickhouse.sql ` allows passing custom SQL query, -but does not support incremental strategies. +.. warning:: -Method also accepts :obj:`JDBCReadOptions `. + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. Syntax support -------------- @@ -19,10 +21,7 @@ Only queries with the following syntax are supported: * ✅︎ ``SELECT ... FROM ...`` * ✅︎ ``WITH alias AS (...) SELECT ...`` - -Queries like ``SHOW ...`` are not supported. - -This method also does not support multiple queries in the same operation, like ``SET ...; SELECT ...;``. +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported Examples -------- @@ -44,12 +43,26 @@ Examples WHERE key = 'something' """, - options=Clickhouse.ReadOptions(partition_column="id", num_partitions=10), + options=Clickhouse.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), ) -References ----------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Clickhouse to Spark. -.. currentmodule:: onetl.connection.db_connection.clickhouse.connection +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automethod:: Clickhouse.sql +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. 
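To make the last recommendation concrete, here is a rough before/after sketch (table and column names are illustrative). Both variants return the same rows, but only the second one pushes the filter down to Clickhouse, so only matching rows are transferred to Spark:

.. code-block:: python

    from onetl.connection import Clickhouse

    clickhouse = Clickhouse(...)

    # all rows are read from Clickhouse, filtering happens on the Spark side
    df = clickhouse.sql("SELECT id, key, value FROM schema.table")
    df = df.filter(df.key == "something")

    # filtering happens on the Clickhouse side, only matching rows are read
    df = clickhouse.sql(
        """
        SELECT id, key, value
        FROM schema.table
        WHERE key = 'something'
        """
    )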
diff --git a/docs/connection/db_connection/clickhouse/write.rst b/docs/connection/db_connection/clickhouse/write.rst index 09f5749f2..1fe56868b 100644 --- a/docs/connection/db_connection/clickhouse/write.rst +++ b/docs/connection/db_connection/clickhouse/write.rst @@ -11,7 +11,7 @@ For writing data to Clickhouse, use :obj:`DBWriter ` + It is always recommended to create table explicitly using :ref:`Clickhouse.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different precision and types than it is expected, @@ -42,8 +42,8 @@ Examples writer.run(df) -Write options -------------- +Options +------- Method above accepts :obj:`JDBCWriteOptions ` diff --git a/docs/connection/db_connection/greenplum/execute.rst b/docs/connection/db_connection/greenplum/execute.rst index 25fbe8962..c0c415369 100644 --- a/docs/connection/db_connection/greenplum/execute.rst +++ b/docs/connection/db_connection/greenplum/execute.rst @@ -5,23 +5,29 @@ Executing statements in Greenplum .. warning:: - Unlike ``DBReader``, ``Greenplum.fetch`` and ``Greenplum.execute`` are implemented using Postgres JDBC connection, - and so types are handled a bit differently. See :ref:`postgres-types`. + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` instead. How to ------ There are 2 ways to execute some statement in Greenplum -Use :obj:`Greenplum.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Greenplum.fetch`` +~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -Greenplum config, or reading data from some reference table. +Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Greenplum.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + ``Greenplum.fetch`` is implemented using Postgres JDBC connection, + so types are handled a bit differently than in ``DBReader``. See :ref:`postgres-types`. Syntax support ^^^^^^^^^^^^^^ @@ -49,23 +55,23 @@ Examples greenplum.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`Greenplum.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Greenplum.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Greenplum.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ This method supports **any** query syntax supported by Greenplum, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on * ✅︎ ``ALTER ...`` -* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``INSERT INTO ... 
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on * ✅︎ ``CALL procedure(arg1, arg2) ...`` * ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions @@ -82,6 +88,7 @@ Examples greenplum = Greenplum(...) with greenplum: + # automatically close connection after exiting this context manager greenplum.execute("DROP TABLE schema.table") greenplum.execute( """ @@ -136,12 +143,6 @@ The only port used while interacting with Greenplum in this case is ``5432`` (Gr Options ------- -.. currentmodule:: onetl.connection.db_connection.greenplum.connection - -.. automethod:: Greenplum.fetch -.. automethod:: Greenplum.execute -.. automethod:: Greenplum.close - .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options .. autopydantic_model:: JDBCOptions diff --git a/docs/connection/db_connection/greenplum/read.rst b/docs/connection/db_connection/greenplum/read.rst index 9f7d925cc..310521677 100644 --- a/docs/connection/db_connection/greenplum/read.rst +++ b/docs/connection/db_connection/greenplum/read.rst @@ -3,13 +3,13 @@ Reading from Greenplum using ``DBReader`` ========================================== +Data can be read from Greenplum to Spark using :obj:`DBReader `. +It also supports :ref:`strategy` for incremental data reading. + .. warning:: Please take into account :ref:`greenplum-types`. -Data can be read from Greenplum to Spark using :obj:`DBReader `. -It also supports :ref:`strategy` for incremental data reading. - .. note:: Unlike JDBC connectors, *Greenplum connector for Spark* does not support @@ -185,14 +185,29 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d Recommendations --------------- -``DBReader`` in case of Greenplum connector requires view or table to have some column which is used by Spark +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Greenplum to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Greenplum to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Read data in parallel +~~~~~~~~~~~~~~~~~~~~~ + +``DBReader`` in case of Greenplum connector requires view or table to have a column which is used by Spark for parallel reads. Choosing proper column allows each Spark executor to read only part of data stored in the specified segment, avoiding moving large amounts of data between segments, which improves reading performance. -Parallel read using ``gp_segment_id`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Using ``gp_segment_id`` +^^^^^^^^^^^^^^^^^^^^^^^ By default, ``DBReader`` will use `gp_segment_id `_ column for parallel data reading. Each DataFrame partition will contain data of a specific Greenplum segment. 
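For a regular Greenplum table this default behaviour requires no extra options; a minimal sketch (names are illustrative):

.. code-block:: python

    from onetl.connection import Greenplum
    from onetl.db import DBReader

    greenplum = Greenplum(...)  # connection parameters omitted

    reader = DBReader(
        connection=greenplum,
        source="schema.table",
        # no partitioning options passed: the connector reads each
        # Greenplum segment (``gp_segment_id``) into its own Spark partition
    )
    df = reader.run()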
@@ -228,8 +243,8 @@ If view is used, it is recommended to include ``gp_segment_id`` column to this v ) df = reader.run() -Parallel read using custom ``partition_column`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Using custom ``partition_column`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Sometimes table or view is lack of ``gp_segment_id`` column, but there is some column with value range correlated with Greenplum segment distribution. @@ -353,8 +368,8 @@ And each connection starts its own transaction which means that every executor w You should use `UNLOGGED `_ tables to write data to intermediate table without generating WAL logs. -Read options ------------- +Options +------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/greenplum/write.rst b/docs/connection/db_connection/greenplum/write.rst index 11309db14..d7e993d41 100644 --- a/docs/connection/db_connection/greenplum/write.rst +++ b/docs/connection/db_connection/greenplum/write.rst @@ -12,7 +12,7 @@ with :obj:`GreenplumWriteOptions ` + It is always recommended to create table explicitly using :ref:`Greenplum.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different types than it is expected. @@ -123,8 +123,8 @@ High-level schema is described in :ref:`greenplum-prerequisites`. You can find d deactivate "Spark driver" @enduml -Write options -------------- +Options +------- .. currentmodule:: onetl.connection.db_connection.greenplum.options diff --git a/docs/connection/db_connection/mongodb/pipeline.rst b/docs/connection/db_connection/mongodb/pipeline.rst index d3c876e4d..f9d60e0f6 100644 --- a/docs/connection/db_connection/mongodb/pipeline.rst +++ b/docs/connection/db_connection/mongodb/pipeline.rst @@ -3,12 +3,22 @@ Reading from MongoDB using ``MongoDB.pipeline`` =============================================== +:obj:`MongoDB.sql ` allows passing custom pipeline, +but does not support incremental strategies. + .. warning:: Please take into account :ref:`mongodb-types` -:obj:`MongoDB.sql ` allows passing custom pipeline, -but does not support incremental strategies. +Recommendations +--------------- + +Pay attention to ``pipeline`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``mongodb.pipeline(..., pipeline={"$match": {"column": {"$eq": "value"}}})`` value. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``pipeline`` value. References ---------- diff --git a/docs/connection/db_connection/mongodb/read.rst b/docs/connection/db_connection/mongodb/read.rst index 8fad2120c..e90bc6e2b 100644 --- a/docs/connection/db_connection/mongodb/read.rst +++ b/docs/connection/db_connection/mongodb/read.rst @@ -3,13 +3,13 @@ Reading from MongoDB using ``DBReader`` ======================================= +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom pipelines, e.g. aggregation. + .. warning:: Please take into account :ref:`mongodb-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom pipelines, e.g. aggregation. 
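Although custom aggregation pipelines are not supported by ``DBReader``, simple filters can still be executed on the MongoDB side via ``where``. A rough sketch, assuming a collection with ``_id``, ``key`` and ``value`` fields (all names are illustrative):

.. code-block:: python

    from pyspark.sql.types import IntegerType, StringType, StructField, StructType

    from onetl.connection import MongoDB
    from onetl.db import DBReader

    mongodb = MongoDB(...)  # connection parameters omitted

    df_schema = StructType(
        [
            StructField("_id", IntegerType()),
            StructField("key", StringType()),
            StructField("value", StringType()),
        ]
    )

    reader = DBReader(
        connection=mongodb,
        source="some_collection",
        df_schema=df_schema,  # MongoDB collections have no fixed schema, so one must be provided
        where={"key": {"$eq": "something"}},  # filter is applied by MongoDB, not by Spark
    )
    df = reader.run()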
- Supported DBReader features --------------------------- @@ -120,6 +120,16 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() +Recommendations +--------------- + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where={"column": {"$eq": "value"}})`` clause. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``where`` clause. + Read options ------------ diff --git a/docs/connection/db_connection/mssql/execute.rst b/docs/connection/db_connection/mssql/execute.rst index 54156ed76..729c75a30 100644 --- a/docs/connection/db_connection/mssql/execute.rst +++ b/docs/connection/db_connection/mssql/execute.rst @@ -3,20 +3,30 @@ Executing statements in MSSQL ============================= +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MSSQL.sql ` instead. + How to ------ There are 2 ways to execute some statement in MSSQL -Use :obj:`MSSQL.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``MSSQL.fetch`` +~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -MSSQL config, or reading data from some reference table. +MSSQL config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`MSSQL.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`mssql-types`. Syntax support ^^^^^^^^^^^^^^ @@ -44,14 +54,14 @@ Examples mssql.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`MSSQL.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``MSSQL.execute`` +~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`MSSQL.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ @@ -77,6 +87,7 @@ Examples mssql = MSSQL(...) with mssql: + # automatically close connection after exiting this context manager mssql.execute("DROP TABLE schema.table") mssql.execute( """ @@ -89,15 +100,8 @@ Examples options=MSSQL.JDBCOptions(query_timeout=10), ) - -References ----------- - -.. currentmodule:: onetl.connection.db_connection.mssql.connection - -.. automethod:: MSSQL.fetch -.. automethod:: MSSQL.execute -.. automethod:: MSSQL.close +Options +------- .. 
currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/mssql/read.rst b/docs/connection/db_connection/mssql/read.rst index 50c958c68..0c8599aea 100644 --- a/docs/connection/db_connection/mssql/read.rst +++ b/docs/connection/db_connection/mssql/read.rst @@ -3,13 +3,13 @@ Reading from MSSQL using ``DBReader`` ====================================== +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. + .. warning:: Please take into account :ref:`mssql-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom queries, like JOINs. - Supported DBReader features --------------------------- @@ -67,8 +67,23 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -Read options ------------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from MSSQL to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mssql/sql.rst b/docs/connection/db_connection/mssql/sql.rst index cf3467de4..8a59d376a 100644 --- a/docs/connection/db_connection/mssql/sql.rst +++ b/docs/connection/db_connection/mssql/sql.rst @@ -3,14 +3,16 @@ Reading from MSSQL using ``MSSQL.sql`` ======================================== +``MSSQL.sql`` allows passing custom SQL query, but does not support incremental strategies. + .. warning:: Please take into account :ref:`mssql-types` -:obj:`MSSQL.sql ` allows passing custom SQL query, -but does not support incremental strategies. +.. warning:: -Method also accepts :obj:`JDBCReadOptions `. + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. Syntax support -------------- @@ -41,12 +43,26 @@ Examples WHERE key = 'something' """, - options=MSSQL.ReadOptions(partition_column="id", num_partitions=10), + options=MSSQL.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), ) -References ----------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from MSSQL to Spark. -.. currentmodule:: onetl.connection.db_connection.mssql.connection +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automethod:: MSSQL.sql +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. 
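+
+For illustration only (table and column names are placeholders; ``mssql`` is the connection object from the examples above), the same filter can be pushed down into the query itself:
+
+.. code-block:: python
+
+    # filtering is performed by MSSQL before data is sent to Spark
+    df = mssql.sql("SELECT id, key, value FROM schema.table WHERE key = 'something'")
+
+    # instead of fetching everything and filtering on the Spark side:
+    # df = mssql.sql("SELECT id, key, value FROM schema.table").filter("key = 'something'")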
diff --git a/docs/connection/db_connection/mssql/write.rst b/docs/connection/db_connection/mssql/write.rst index eb15fe93e..854283704 100644 --- a/docs/connection/db_connection/mssql/write.rst +++ b/docs/connection/db_connection/mssql/write.rst @@ -11,7 +11,7 @@ For writing data to MSSQL, use :obj:`DBWriter ` + It is always recommended to create table explicitly using :ref:`MSSQL.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different precision and types than it is expected, @@ -37,9 +37,8 @@ Examples writer.run(df) - -Write options -------------- +Options +------- Method above accepts :obj:`JDBCWriteOptions ` diff --git a/docs/connection/db_connection/mysql/execute.rst b/docs/connection/db_connection/mysql/execute.rst index 243f4e704..34460edae 100644 --- a/docs/connection/db_connection/mysql/execute.rst +++ b/docs/connection/db_connection/mysql/execute.rst @@ -1,22 +1,32 @@ .. _mysql-execute: Executing statements in MySQL -================================== +============================= + +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`MySQL.sql ` instead. How to ------ There are 2 ways to execute some statement in MySQL -Use :obj:`MySQL.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``MySQL.fetch`` +~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -MySQL config, or reading data from some reference table. +MySQL config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`MySQL.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`mysql-types`. Syntax support ^^^^^^^^^^^^^^ @@ -45,23 +55,23 @@ Examples mysql.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`MySQL.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``MySQL.execute`` +~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`MySQL.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ This method supports **any** query syntax supported by MySQL, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on * ✅︎ ``ALTER ...`` -* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on * ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure * ✅︎ other statements not mentioned here @@ -77,6 +87,7 @@ Examples mysql = MySQL(...) 
with mysql: + # automatically close connection after exiting this context manager mysql.execute("DROP TABLE schema.table") mysql.execute( """ @@ -90,14 +101,8 @@ Examples options=MySQL.JDBCOptions(query_timeout=10), ) -References ----------- - -.. currentmodule:: onetl.connection.db_connection.mysql.connection - -.. automethod:: MySQL.fetch -.. automethod:: MySQL.execute -.. automethod:: MySQL.close +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/mysql/read.rst b/docs/connection/db_connection/mysql/read.rst index ebf7f7a49..e72da45f1 100644 --- a/docs/connection/db_connection/mysql/read.rst +++ b/docs/connection/db_connection/mysql/read.rst @@ -3,13 +3,13 @@ Reading from MySQL using ``DBReader`` ===================================== +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. + .. warning:: Please take into account :ref:`mysql-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom queries, like JOINs. - Supported DBReader features --------------------------- @@ -69,8 +69,23 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -Read options ------------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/mysql/sql.rst b/docs/connection/db_connection/mysql/sql.rst index 6ef094cd4..c161efa83 100644 --- a/docs/connection/db_connection/mysql/sql.rst +++ b/docs/connection/db_connection/mysql/sql.rst @@ -3,14 +3,16 @@ Reading from MySQL using ``MySQL.sql`` ====================================== +``MySQL.sql`` allows passing custom SQL query, but does not support incremental strategies. + .. warning:: Please take into account :ref:`mysql-types` -:obj:`MySQL.sql ` allows passing custom SQL query, -but does not support incremental strategies. +.. warning:: -Method also accepts :obj:`JDBCReadOptions `. + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. Syntax support -------------- @@ -42,12 +44,26 @@ Examples WHERE key = 'something' """, - options=MySQL.ReadOptions(partition_column="id", num_partitions=10), + options=MySQL.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), ) -References ----------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from MySQL to Spark. -.. 
currentmodule:: onetl.connection.db_connection.mysql.connection +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automethod:: MySQL.sql +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/mysql/write.rst b/docs/connection/db_connection/mysql/write.rst index 6974c142f..2d7c056c9 100644 --- a/docs/connection/db_connection/mysql/write.rst +++ b/docs/connection/db_connection/mysql/write.rst @@ -11,7 +11,7 @@ For writing data to MySQL, use :obj:`DBWriter ` + It is always recommended to create table explicitly using :ref:`MySQL.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different precision and types than it is expected, @@ -41,9 +41,8 @@ Examples writer.run(df) - -Write options -------------- +Options +------- Method above accepts :obj:`JDBCWriteOptions ` diff --git a/docs/connection/db_connection/oracle/execute.rst b/docs/connection/db_connection/oracle/execute.rst index 10b65731e..b8533b278 100644 --- a/docs/connection/db_connection/oracle/execute.rst +++ b/docs/connection/db_connection/oracle/execute.rst @@ -1,22 +1,32 @@ .. _oracle-execute: Executing statements in Oracle -================================== +============================== + +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Oracle.sql ` instead. How to ------ There are 2 ways to execute some statement in Oracle -Use :obj:`Oracle.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Oracle.fetch`` +~~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -Oracle config, or reading data from some reference table. +Oracle config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Oracle.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`oracle-types`. Syntax support ^^^^^^^^^^^^^^ @@ -45,14 +55,14 @@ Examples oracle.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`Oracle.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Oracle.execute`` +~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Oracle.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ @@ -61,7 +71,7 @@ This method supports **any** query syntax supported by Oracle, like: * ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...`` * ✅︎ ``ALTER ...`` -* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``INSERT INTO ... 
SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on * ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure * ✅︎ ``DECLARE ... BEGIN ... END`` - execute PL/SQL statement @@ -78,6 +88,7 @@ Examples oracle = Oracle(...) with oracle: + # automatically close connection after exiting this context manager oracle.execute("DROP TABLE schema.table") oracle.execute( """ @@ -90,15 +101,8 @@ Examples options=Oracle.JDBCOptions(query_timeout=10), ) - -References ----------- - -.. currentmodule:: onetl.connection.db_connection.oracle.connection - -.. automethod:: Oracle.fetch -.. automethod:: Oracle.execute -.. automethod:: Oracle.close +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/oracle/read.rst b/docs/connection/db_connection/oracle/read.rst index 5877e2caf..6592cfc7b 100644 --- a/docs/connection/db_connection/oracle/read.rst +++ b/docs/connection/db_connection/oracle/read.rst @@ -3,13 +3,13 @@ Reading from Oracle using ``DBReader`` ====================================== +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. + .. warning:: Please take into account :ref:`oracle-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom queries, like JOINs. - Supported DBReader features --------------------------- @@ -69,8 +69,23 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -Read options ------------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/oracle/sql.rst b/docs/connection/db_connection/oracle/sql.rst index 6971de4ed..969c28afa 100644 --- a/docs/connection/db_connection/oracle/sql.rst +++ b/docs/connection/db_connection/oracle/sql.rst @@ -3,14 +3,16 @@ Reading from Oracle using ``Oracle.sql`` ======================================== +``Oracle.sql`` allows passing custom SQL query, but does not support incremental strategies. + .. warning:: Please take into account :ref:`oracle-types` -:obj:`Oracle.sql ` allows passing custom SQL query, -but does not support incremental strategies. +.. warning:: -Method also accepts :obj:`JDBCReadOptions `. + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. 
Syntax support -------------- @@ -42,12 +44,26 @@ Examples WHERE key = 'something' """, - options=Oracle.ReadOptions(partition_column="id", num_partitions=10), + options=Oracle.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), ) -References ----------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Oracle to Spark. -.. currentmodule:: onetl.connection.db_connection.oracle.connection +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automethod:: Oracle.sql +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/oracle/write.rst b/docs/connection/db_connection/oracle/write.rst index 0f21ec2ac..5ce0e3b86 100644 --- a/docs/connection/db_connection/oracle/write.rst +++ b/docs/connection/db_connection/oracle/write.rst @@ -11,7 +11,7 @@ For writing data to Oracle, use :obj:`DBWriter ` + It is always recommended to create table explicitly using :ref:`Oracle.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different precision and types than it is expected, @@ -37,9 +37,8 @@ Examples writer.run(df) - -Write options -------------- +Options +------- Method above accepts :obj:`JDBCWriteOptions ` diff --git a/docs/connection/db_connection/postgres/execute.rst b/docs/connection/db_connection/postgres/execute.rst index ac9f553a5..8c966c110 100644 --- a/docs/connection/db_connection/postgres/execute.rst +++ b/docs/connection/db_connection/postgres/execute.rst @@ -3,20 +3,30 @@ Executing statements in Postgres ================================ +.. warning:: + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Postgres.sql ` instead. + How to ------ There are 2 ways to execute some statement in Postgres -Use :obj:`Postgres.fetch ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Postgres.fetch`` +~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading -Postgres config, or reading data from some reference table. +Postgres config, or reading data from some reference table. Method returns Spark DataFrame. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Postgres.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +.. warning:: + + Please take into account :ref:`postgres-types`. Syntax support ^^^^^^^^^^^^^^ @@ -43,23 +53,23 @@ Examples postgres.close() value = df.collect()[0][0] # get value from first row and first column -Use :obj:`Postgres.execute ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``Postgres.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~ Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. Method accepts :obj:`JDBCOptions `. -Connection opened using this method should be then closed with :obj:`Postgres.close `. +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. Syntax support ^^^^^^^^^^^^^^ This method supports **any** query syntax supported by Postgres, like: -* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, and so on +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on * ✅︎ ``ALTER ...`` -* ✅︎ ``INSERT INTO ... AS SELECT ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on * ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on * ✅︎ ``CALL procedure(arg1, arg2) ...`` * ✅︎ ``SELECT func(arg1, arg2)`` or ``{call func(arg1, arg2)}`` - special syntax for calling functions @@ -76,6 +86,7 @@ Examples postgres = Postgres(...) with postgres: + # automatically close connection after exiting this context manager postgres.execute("DROP TABLE schema.table") postgres.execute( """ @@ -88,15 +99,8 @@ Examples options=Postgres.JDBCOptions(query_timeout=10), ) - -References ----------- - -.. currentmodule:: onetl.connection.db_connection.postgres.connection - -.. automethod:: Postgres.fetch -.. automethod:: Postgres.execute -.. automethod:: Postgres.close +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/postgres/read.rst b/docs/connection/db_connection/postgres/read.rst index 0733f8e08..67b5234a2 100644 --- a/docs/connection/db_connection/postgres/read.rst +++ b/docs/connection/db_connection/postgres/read.rst @@ -3,13 +3,13 @@ Reading from Postgres using ``DBReader`` ======================================== +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. + .. warning:: Please take into account :ref:`postgres-types` -:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, -but does not support custom queries, like JOINs. - Supported DBReader features --------------------------- @@ -67,8 +67,23 @@ Incremental strategy: with IncrementalStrategy(): df = reader.run() -Read options ------------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Postgres to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/postgres/sql.rst b/docs/connection/db_connection/postgres/sql.rst index e0316e2a4..3f762f1ad 100644 --- a/docs/connection/db_connection/postgres/sql.rst +++ b/docs/connection/db_connection/postgres/sql.rst @@ -3,14 +3,16 @@ Reading from Postgres using ``Postgres.sql`` ============================================ +``Postgres.sql`` allows passing custom SQL query, but does not support incremental strategies. + .. 
warning:: Please take into account :ref:`postgres-types` -:obj:`Postgres.sql ` allows passing custom SQL query, -but does not support incremental strategies. +.. warning:: -Method also accepts :obj:`JDBCReadOptions `. + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. Syntax support -------------- @@ -41,12 +43,26 @@ Examples WHERE key = 'something' """, - options=Postgres.ReadOptions(partition_column="id", num_partitions=10), + options=Postgres.ReadOptions( + partition_column="id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + ), ) -References ----------- +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Postgres to Spark. -.. currentmodule:: onetl.connection.db_connection.postgres.connection +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. automethod:: Postgres.sql +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. diff --git a/docs/connection/db_connection/postgres/write.rst b/docs/connection/db_connection/postgres/write.rst index ad0c42f68..f35edf9b8 100644 --- a/docs/connection/db_connection/postgres/write.rst +++ b/docs/connection/db_connection/postgres/write.rst @@ -11,7 +11,7 @@ For writing data to Postgres, use :obj:`DBWriter ` + It is always recommended to create table explicitly using :ref:`Postgres.execute ` instead of relying on Spark's table DDL generation. This is because Spark's DDL generator can create columns with different precision and types than it is expected, @@ -37,9 +37,8 @@ Examples writer.run(df) - -Write options -------------- +Options +------- Method above accepts :obj:`JDBCWriteOptions ` diff --git a/docs/connection/db_connection/teradata/execute.rst b/docs/connection/db_connection/teradata/execute.rst index 80853f919..545b473d8 100644 --- a/docs/connection/db_connection/teradata/execute.rst +++ b/docs/connection/db_connection/teradata/execute.rst @@ -3,11 +3,103 @@ Executing statements in Teradata ================================ -.. currentmodule:: onetl.connection.db_connection.teradata.connection +.. warning:: -.. automethod:: Teradata.fetch -.. automethod:: Teradata.execute -.. automethod:: Teradata.close + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use :ref:`DBReader ` or :ref:`Teradata.sql ` instead. + +How to +------ + +There are 2 ways to execute some statement in Teradata + +Use ``Teradata.fetch`` +~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute some ``SELECT`` query which returns **small number or rows**, like reading +Teradata config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Teradata, like: + +* ✅︎ ``SELECT ... 
FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ✅︎ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + + df = teradata.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Teradata.JDBCOptions(query_timeout=10), + ) + teradata.close() + value = df.collect()[0][0] # get value from first row and first column + +Use ``Teradata.execute`` +~~~~~~~~~~~~~~~~~~~~~~~~ + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts :obj:`JDBCOptions `. + +Connection opened using this method should be then closed with ``connection.close()`` or ``with connection:``. + +Syntax support +^^^^^^^^^^^^^^ + +This method supports **any** query syntax supported by Teradata, like: + +* ✅︎ ``CREATE TABLE ...``, ``CREATE VIEW ...``, ``DROP TABLE ...``, and so on +* ✅︎ ``ALTER ...`` +* ✅︎ ``INSERT INTO ... SELECT ...``, ``UPDATE ...``, ``DELETE ...``, and so on +* ✅︎ ``DROP TABLE ...``, ``DROP VIEW ...``, and so on +* ✅︎ ``CALL procedure(arg1, arg2) ...`` or ``{call procedure(arg1, arg2)}`` - special syntax for calling procedure +* ✅︎ ``EXECUTE macro(arg1, arg2)`` +* ✅︎ ``EXECUTE FUNCTION ...`` +* ✅︎ other statements not mentioned here +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +^^^^^^^^ + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + + with teradata: + # automatically close connection after exiting this context manager + teradata.execute("DROP TABLE database.table") + teradata.execute( + """ + CREATE MULTISET TABLE database.table AS ( + id BIGINT, + key VARCHAR, + value REAL + ) + NO PRIMARY INDEX + """, + options=Teradata.JDBCOptions(query_timeout=10), + ) + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_mixin.options diff --git a/docs/connection/db_connection/teradata/index.rst b/docs/connection/db_connection/teradata/index.rst index 2f6d6636d..a7d021947 100644 --- a/docs/connection/db_connection/teradata/index.rst +++ b/docs/connection/db_connection/teradata/index.rst @@ -7,6 +7,7 @@ Teradata :maxdepth: 1 :caption: Connection + prerequisites connection .. toctree:: @@ -14,5 +15,6 @@ Teradata :caption: Operations read + sql write execute diff --git a/docs/connection/db_connection/teradata/prerequisites.rst b/docs/connection/db_connection/teradata/prerequisites.rst new file mode 100644 index 000000000..294f9d536 --- /dev/null +++ b/docs/connection/db_connection/teradata/prerequisites.rst @@ -0,0 +1,60 @@ +.. _teradata-prerequisites: + +Prerequisites +============= + +Version Compatibility +--------------------- + +* Teradata server versions: 16.10 - 20.0 +* Spark versions: 2.3.x - 3.5.x +* Java versions: 8 - 20 + +See `official documentation `_. + +Installing PySpark +------------------ + +To use Teradata connector you should have PySpark installed (or injected to ``sys.path``) +BEFORE creating the connector instance. + +See :ref:`install-spark` installation instruction for more details. + +Connecting to Teradata +----------------------- + +Connection host +~~~~~~~~~~~~~~~ + +It is possible to connect to Teradata by using either DNS name Parsing Engine (PE) host, or it's IP address. + +Connection port +~~~~~~~~~~~~~~~ + +Connection is usually performed to port ``1025``. Port may differ for different Teradata instances. 
+Please ask your Teradata administrator to provide required information. + +Required grants +~~~~~~~~~~~~~~~ + +Ask your Teradata cluster administrator to set following grants for a user, +used for creating a connection: + +.. tabs:: + + .. code-tab:: sql Read + Write + + -- allow creating tables in the target schema + GRANT CREATE TABLE ON database TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON database.mytable TO username; + + .. code-tab:: sql Read only + + -- allow read access to specific table + GRANT SELECT ON database.mytable TO username; + +See: + * `Teradata access rights `_ + * `GRANT documentation `_ diff --git a/docs/connection/db_connection/teradata/read.rst b/docs/connection/db_connection/teradata/read.rst index b23c42a59..f4cf95bfb 100644 --- a/docs/connection/db_connection/teradata/read.rst +++ b/docs/connection/db_connection/teradata/read.rst @@ -1,18 +1,119 @@ .. _teradata-read: -Reading from Teradata -===================== +Reading from Teradata using ``DBReader`` +======================================== -There are 2 ways of distributed data reading from Teradata: +:obj:`DBReader ` supports :ref:`strategy` for incremental data reading, +but does not support custom queries, like ``JOIN``. -* Using :obj:`DBReader ` with different :ref:`strategy` -* Using :obj:`Teradata.sql ` +Supported DBReader features +--------------------------- -Both methods accept :obj:`JDBCReadOptions ` +* ✅︎ ``columns`` +* ✅︎ ``where`` +* ✅︎ ``hwm``, supported strategies: +* * ✅︎ :ref:`snapshot-strategy` +* * ✅︎ :ref:`incremental-strategy` +* * ✅︎ :ref:`snapshot-batch-strategy` +* * ✅︎ :ref:`incremental-batch-strategy` +* ❌ ``hint`` (is not supported by Teradata) +* ❌ ``df_schema`` +* ✅︎ ``options`` (see :obj:`JDBCReadOptions `) -.. currentmodule:: onetl.connection.db_connection.teradata.connection +Examples +-------- -.. automethod:: Teradata.sql +Snapshot strategy: + +.. code-block:: python + + from onetl.connection import Teradata + from onetl.db import DBReader + + teradata = Teradata(...) + + reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + options=Teradata.ReadOptions( + partition_column="id", + num_partitions=10, + partitioning_mode="hash", + ), + ) + df = reader.run() + +Incremental strategy: + +.. code-block:: python + + from onetl.connection import Teradata + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + teradata = Teradata(...) + + reader = DBReader( + connection=teradata, + source="database.table", + columns=["id", "key", "CAST(value AS VARCHAR) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="teradata_hwm", expression="updated_dt"), + options=Teradata.ReadOptions( + partition_column="id", + num_partitions=10, + partitioning_mode="hash", + ), + ) + + with IncrementalStrategy(): + df = reader.run() + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``"*"`` in ``DBReader(columns=[...])`` prefer passing exact column names. This reduces the amount of data passed from Teradata to Spark. + +Pay attention to ``where`` value +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``DBReader(where="column = 'value'")`` clause. 
+This both reduces the amount of data send from Teradata to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in ``where`` clause. + +Read data in parallel +~~~~~~~~~~~~~~~~~~~~~ + +``DBReader`` can read data in multiple parallel connections by passing ``Teradata.ReadOptions(num_partitions=..., partition_column=...)``. + +In the example above, Spark opens 10 parallel connections, and data is evenly distributed between all these connections using expression +``HASHAMP(HASHBUCKET(HASHROW({partition_column}))) MOD {num_partitions}``. +This allows sending each Spark worker only some piece of data, reducing resource consumption. +``partition_column`` here can be table column of any type. + +It is also possible to use ``partitioning_mode="mod"`` or ``partitioning_mode="range"``, but in this case +``partition_column`` have to be an integer, should not contain ``NULL``, and values to be uniformly distributed. +It is also less performant than ``partitioning_mode="hash"`` due to Teradata ``HASHAMP`` implementation. + +Do **NOT** use ``TYPE=FASTEXPORT`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Teradata supports several `different connection types `_: + * ``TYPE=DEFAULT`` - perform plain ``SELECT`` queries + * ``TYPE=FASTEXPORT`` - uses special FastExport protocol for select queries + +But ``TYPE=FASTEXPORT`` uses exclusive lock on the source table, so it is impossible to use multiple Spark workers parallel data read. +This leads to sending all the data to just one Spark worker, which is slow and takes a lot of RAM. + +Prefer using ``partitioning_mode="hash"`` from example above. + +Options +------- .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/docs/connection/db_connection/teradata/sql.rst b/docs/connection/db_connection/teradata/sql.rst new file mode 100644 index 000000000..3531d01c3 --- /dev/null +++ b/docs/connection/db_connection/teradata/sql.rst @@ -0,0 +1,66 @@ +.. _teradata-sql: + +Reading from Teradata using ``Teradata.sql`` +============================================ + +``Teradata.sql`` allows passing custom SQL query, but does not support incremental strategies. + +.. warning:: + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +Syntax support +-------------- + +Only queries with the following syntax are supported: + +* ✅︎ ``SELECT ... FROM ...`` +* ✅︎ ``WITH alias AS (...) SELECT ...`` +* ❌ ``SHOW ...`` +* ❌ ``SET ...; SELECT ...;`` - multiple statements not supported + +Examples +-------- + +.. code-block:: python + + from onetl.connection import Teradata + + teradata = Teradata(...) + df = teradata.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR) AS value, + updated_at, + HASHAMP(HASHBUCKET(HASHROW(id))) MOD 10 AS part_column + FROM + database.mytable + WHERE + key = 'something' + """, + options=Teradata.ReadOptions( + partition_column="part_column", + num_partitions=10, + lower_bound=0, + upper_bound=9, + ), + ) + +Recommendations +--------------- + +Select only required columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of passing ``SELECT * FROM ...`` prefer passing exact column names ``SELECT col1, col2, ...``. +This reduces the amount of data passed from Teradata to Spark. 
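+
+A short sketch of this recommendation (``teradata`` is the connection object from the example above; table and column names are placeholders):
+
+.. code-block:: python
+
+    # only the required columns are sent from Teradata to Spark
+    df = teradata.sql("SELECT id, updated_at FROM database.mytable")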
+
+Pay attention to ``where`` value
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Instead of filtering data on Spark side using ``df.filter(df.column == 'value')`` pass proper ``WHERE column = 'value'`` clause.
+This both reduces the amount of data sent from Teradata to Spark, and may also improve performance of the query.
+Especially if there are indexes or partitions for columns used in ``where`` clause.
diff --git a/docs/connection/db_connection/teradata/write.rst b/docs/connection/db_connection/teradata/write.rst
index 7e5cbef40..288e79667 100644
--- a/docs/connection/db_connection/teradata/write.rst
+++ b/docs/connection/db_connection/teradata/write.rst
@@ -1,9 +1,116 @@
 .. _teradata-write:

-Writing to Teradata
-===================
+Writing to Teradata using ``DBWriter``
+======================================

-For writing data to Teradata, use :obj:`DBWriter ` with options below.
+For writing data to Teradata, use :obj:`DBWriter `.
+
+.. warning::
+
+    It is always recommended to create table explicitly using :ref:`Teradata.execute `
+    instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different precision and types than it is expected,
+    causing precision loss or other issues.
+
+Examples
+--------
+
+.. code-block:: python
+
+    from onetl.connection import Teradata
+    from onetl.db import DBWriter
+
+    teradata = Teradata(
+        ...,
+        extra={"TYPE": "FASTLOAD", "TMODE": "TERA"},
+    )
+
+    df = ...  # data is here
+
+    writer = DBWriter(
+        connection=teradata,
+        target="database.table",
+        options=Teradata.WriteOptions(
+            if_exists="append",
+            # avoid creating SET table, use MULTISET
+            createTableOptions="NO PRIMARY INDEX",
+        ),
+    )
+
+    writer.run(df.repartition(1))
+
+Recommendations
+---------------
+
+Number of connections
+~~~~~~~~~~~~~~~~~~~~~
+
+Teradata is not MVCC based, so write operations take an exclusive lock on the entire table.
+So **it is impossible to write data to Teradata table in multiple parallel connections**, no exceptions.
+
+The only way to write to Teradata without causing deadlocks is to write a dataframe with exactly 1 partition.
+
+It can be implemented using ``df.repartition(1)``:
+
+.. code-block:: python
+
+    # do NOT use df.coalesce(1) as it can freeze
+    writer.run(df.repartition(1))
+
+This moves all the data to just one Spark worker, so it may consume a lot of RAM. It is usually required to increase ``spark.executor.memory`` to handle this.
+
+Another way is to write all dataframe partitions one-by-one:
+
+.. code-block:: python
+
+    from pyspark.sql.functions import spark_partition_id
+
+    # add an explicit column with Spark partition number,
+    # and persist the dataframe to avoid recomputing it on every loop iteration
+    df = df.withColumn("part_id", spark_partition_id()).persist()
+
+    # get list of all partitions in the dataframe
+    partitions = sorted(row.part_id for row in df.select("part_id").distinct().collect())
+
+    for part_id in partitions:
+        # get only part of data within this exact partition
+        part_df = df.where(df.part_id == part_id).drop("part_id").coalesce(1)
+
+        writer.run(part_df)
+
+This requires even data distribution across all partitions to avoid data skew and spikes of RAM consumption.
+
+Choosing connection type
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Teradata supports several `different connection types `_:
+  * ``TYPE=DEFAULT`` - perform plain ``INSERT`` queries
+  * ``TYPE=FASTLOAD`` - uses special FastLoad protocol for insert queries
+
+It is always recommended to use ``TYPE=FASTLOAD`` because:
+  * It provides higher performance
+  * It properly handles inserting ``NULL`` values (``TYPE=DEFAULT`` raises an exception)
+
+But it can be used only during write, not read.
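+
+If the same job both reads from and writes to Teradata, a minimal sketch is to use two separate connection objects
+(connection parameters are omitted here and are placeholders):
+
+.. code-block:: python
+
+    from onetl.connection import Teradata
+
+    # connection used only for writing, with FastLoad protocol
+    teradata_write = Teradata(..., extra={"TYPE": "FASTLOAD", "TMODE": "TERA"})
+
+    # separate connection used for reading, with plain SELECT queries
+    teradata_read = Teradata(..., extra={"TYPE": "DEFAULT"})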
+ +Choosing transaction mode +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Teradata supports `2 different transaction modes `_: + * ``TMODE=ANSI`` + * ``TMODE=TERA`` + +Choosing one of the modes can alter connector behavior. For example: + * Inserting data which exceeds table column length, like insert ``CHAR(25)`` to column with type ``CHAR(24)``: + * * ``TMODE=ANSI`` - raises exception + * * ``TMODE=TERA`` - truncates input string to 24 symbols + * Creating table using Spark: + * * ``TMODE=ANSI`` - creates ``MULTISET`` table + * * ``TMODE=TERA`` - creates ``SET`` table with ``PRIMARY KEY`` is a first column in dataframe. + This can lead to slower insert time, because each row will be checked against a unique index. + Fortunately, this can be disabled by passing custom ``createTableOptions``. + +Options +------- + +Method above accepts :obj:`JDBCWriteOptions ` .. currentmodule:: onetl.connection.db_connection.jdbc_connection.options diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index cdf5b5d5c..5f687112f 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -82,8 +82,8 @@ class HiveWriteOptions(GenericOptions): options = Hive.WriteOptions( if_exists="append", - partitionBy="reg_id", - someNewOption="value", + partition_by="reg_id", + customOption="value", ) """ diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 9396dc802..3133d3671 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -68,38 +68,15 @@ def sql( Same as ``spark.read.jdbc(query)``. - .. note:: - - This method does not support :ref:`strategy`, - use :obj:`DBReader ` instead - - .. note:: - - Statement is executed in read-write connection, - so if you're calling some functions/procedures with DDL/DML statements inside, - they can change data in your database. - - Unfortunately, Spark does no provide any option to change this behavior. - Parameters ---------- query : str SQL query to be executed. - Only ``SELECT ... FROM ...`` form is supported. - - Some databases also supports ``WITH ... AS (...) SELECT ... FROM ...`` form. - - Queries like ``SHOW ...`` are not supported. - - .. warning:: - - The exact syntax **depends on RDBMS** is being used. - options : dict, :obj:`~ReadOptions`, default: ``None`` - Spark options to be used while fetching data, like ``fetchsize`` or ``partitionColumn`` + Spark options to be used while fetching data. Returns ------- @@ -107,23 +84,6 @@ def sql( Spark dataframe - Examples - -------- - - Read data from a table: - - .. code:: python - - df = connection.sql("SELECT * FROM mytable") - - Read data from a table with options: - - .. code:: python - - # reads data from table in batches, 10000 rows per batch - df = connection.sql("SELECT * FROM mytable", {"fetchsize": 10000}) - assert df.count() - """ query = clear_statement(query) diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index fbeb3cead..50eda6a51 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -123,11 +123,11 @@ class JDBCReadOptions(JDBCOptions): .. 
code:: python options = JDBC.ReadOptions( - partitionColumn="reg_id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - someNewOption="value", + partition_column="reg_id", + num_partitions=10, + lower_bound=0, + upper_bound=1000, + customOption="value", ) """ @@ -390,7 +390,7 @@ class JDBCWriteOptions(JDBCOptions): .. code:: python - options = JDBC.WriteOptions(if_exists="append", batchsize=20_000, someNewOption="value") + options = JDBC.WriteOptions(if_exists="append", batchsize=20_000, customOption="value") """ class Config: diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index ab49aae81..76424c30c 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -147,22 +147,7 @@ def fetch( """ **Immediately** execute SELECT statement **on Spark driver** and return in-memory DataFrame. |support_hooks| - Works almost the same like :obj:`~sql`, but calls JDBC driver directly. - - .. warning:: - - This function should **NOT** be used to return dataframe with large number of rows/columns, - like ``SELECT * FROM table`` (without any filtering). - - Use it **ONLY** to execute queries with **one or just a few returning rows/columns**, - like ``SELECT COUNT(*) FROM table`` or ``SELECT * FROM table WHERE id = ...``. - - This is because all the data is fetched through master host, not in a distributed way - (even on clustered instances). - Also the resulting dataframe is stored directly in driver's memory, which is usually quite limited. - - Use :obj:`DBReader ` for reading - data from table via Spark executors, or for handling different :ref:`strategy`. + Works almost the same like :obj:`~sql`, but Spark executor is not used. .. note:: @@ -179,16 +164,6 @@ def fetch( SQL query to be executed. - Can be in any form of SELECT supported by RDBMS, like: - - * ``SELECT ... FROM ...`` - * ``WITH ... AS (...) SELECT ... FROM ...`` - * *Some* RDBMS instances also support ``SHOW ...`` queries, like ``SHOW TABLES`` - - .. warning:: - - The exact syntax **depends on a RDBMS** is being used. - options : dict, :obj:`~JDBCOptions`, default: ``None`` Options to be passed directly to JDBC driver, like ``fetchsize`` or ``queryTimeout`` @@ -202,16 +177,6 @@ def fetch( df : pyspark.sql.dataframe.DataFrame Spark dataframe - - Examples - -------- - - Read data from a table: - - .. code:: python - - df = connection.fetch("SELECT * FROM mytable LIMIT 10") - assert df.count() """ query = clear_statement(query) @@ -237,25 +202,9 @@ def execute( """ **Immediately** execute DDL, DML or procedure/function **on Spark driver**. |support_hooks| - Returns DataFrame only if input is DML statement with ``RETURNING ...`` clause, or a procedure/function call. - In other cases returns ``None``. - There is no method like this in :obj:`pyspark.sql.SparkSession` object, but Spark internal methods works almost the same (but on executor side). - .. warning:: - - Use this method **ONLY** to execute queries which return **small number of rows/column, - or return nothing**. - - This is because all the data is fetched through master host, not in a distributed way - (even on clustered instances). - Also the resulting dataframe is stored directly in driver's memory, which is usually quite limited. - - Please **DO NOT** use this method to execute queries - like ``INSERT INTO ... VALUES(a, lot, of, values)``, - use :obj:`onetl.db.db_writer.db_writer.DBWriter` instead - .. 
note:: First call of the method opens the connection to a database. @@ -265,33 +214,7 @@ def execute( ---------- statement : str - Statement to be executed, like: - - DML statements: - - * ``INSERT INTO target_table SELECT * FROM source_table`` - * ``UPDATE mytable SET value = 1 WHERE id BETWEEN 100 AND 999`` - * ``DELETE FROM mytable WHERE id BETWEEN 100 AND 999`` - * ``TRUNCATE TABLE mytable`` - - DDL statements: - - * ``CREATE TABLE mytable (...)`` - * ``ALTER SCHEMA myschema ...`` - * ``DROP PROCEDURE myprocedure`` - - Call statements: - - * ``BEGIN ... END`` - * ``EXEC myprocedure`` - * ``EXECUTE myprocedure(arg1)`` - * ``CALL myfunction(arg1, arg2)`` - * ``{call myprocedure(arg1, ?)}`` (``?`` is output parameter) - * ``{?= call myfunction(arg1, arg2)}`` - - .. warning:: - - The exact syntax **depends on a RDBMS** is being used. + Statement to be executed. options : dict, :obj:`~JDBCOptions`, default: ``None`` @@ -305,27 +228,10 @@ def execute( ------- df : pyspark.sql.dataframe.DataFrame, optional - Spark dataframe - - Examples - -------- - - Create table: - - .. code:: python - - assert connection.execute("CREATE TABLE target_table(id NUMBER, data VARCHAR)") is None + Spark DataFrame. - Insert data to one table from another, with a specific transaction isolation level, - and return DataFrame with new rows: - - .. code:: python - - df = connection.execute( - "INSERT INTO target_table SELECT * FROM source_table RETURNING id, data", - {"isolationLevel": "READ_COMMITTED"}, - ) - assert df.count() + DataFrame is returned only if input is DML statement with ``RETURNING ...`` clause, + or a procedure/function call. In other cases returns ``None``. """ statement = clear_statement(statement) diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 95741927b..0dc3e67dd 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -32,29 +32,9 @@ class Teradata(JDBCConnection): Based on package ``com.teradata.jdbc:terajdbc:17.20.00.15`` (`official Teradata JDBC driver `_). - .. dropdown:: Version compatibility - - * Teradata server versions: 16.10 - 20.0 - * Spark versions: 2.3.x - 3.5.x - * Java versions: 8 - 20 - - See `official documentation `_. - .. warning:: - To use Teradata connector you should have PySpark installed (or injected to ``sys.path``) - BEFORE creating the connector instance. - - You can install PySpark as follows: - - .. code:: bash - - pip install onetl[spark] # latest PySpark version - - # or - pip install onetl pyspark=3.5.0 # pass specific PySpark version - - See :ref:`install-spark` installation instruction for more details. 
+ Before using this connector please take into account :ref:`teradata-prerequisites` Parameters ---------- @@ -92,10 +72,10 @@ class Teradata(JDBCConnection): By default, these options are added to extra: * ``CHARSET = "UTF8"`` - * ``COLUMN_NAME = "ON"`` - * ``FLATTEN = "ON"`` + * ``COLUMN_NAME = "ON"`` - allow reading column title from a table + * ``FLATTEN = "ON"`` - improves error messages * ``MAYBENULL = "ON"`` - * ``STRICT_NAMES = "OFF"`` + * ``STRICT_NAMES = "OFF"`` - ignore Spark options passed to JDBC URL It is possible to override default values, for example set ``extra={"FLATTEN": "OFF"}`` From 45e1405166a3220b7508ad6c98e48d23b011722c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 07:44:04 +0000 Subject: [PATCH 54/63] [DOP-13779] Update blacken-docs --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c591ac80d..9a1da04b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -105,7 +105,7 @@ repos: hooks: - id: blacken-docs additional_dependencies: - - black==24.2.0 + - black==24.3.0 - repo: meta hooks: From 161fccf1339dd16386692571c931410ec510fb86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 07:54:19 +0000 Subject: [PATCH 55/63] [DOP-13779] Add bandit to pre-commit config --- .pre-commit-config.yaml | 10 ++++++++++ onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 4 ++-- tests/fixtures/create_keytab.py | 2 +- tests/tests_integration/tests_hwm_store_integration.py | 2 +- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9a1da04b7..516ea1cf1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -107,6 +107,16 @@ repos: additional_dependencies: - black==24.3.0 + - repo: https://github.com/pycqa/bandit + rev: 1.7.8 + hooks: + - id: bandit + args: + - --aggregate=file + - -iii + - -ll + require_serial: true + - repo: meta hooks: - id: check-hooks-apply diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index 6fcbaa856..07b8b53e1 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import ftplib # noqa: S402 +import ftplib # noqa: S402 # nosec import os import textwrap from logging import getLogger diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 859cca942..9277b66fd 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) # SPDX-License-Identifier: Apache-2.0 -import ftplib # NOQA: S402 +import ftplib # noqa: S402 # nosec import textwrap from ftputil import FTPHost @@ -31,7 +31,7 @@ class TLSfix(ftplib.FTP_TLS): # noqa: N801 """ def ntransfercmd(self, cmd, rest=None): - conn, size = ftplib.FTP.ntransfercmd(self, cmd, rest) # noqa: S321 + conn, size = ftplib.FTP.ntransfercmd(self, cmd, rest) # noqa: S321 # nosec if self._prot_p: 
conn = self.context.wrap_socket( conn, diff --git a/tests/fixtures/create_keytab.py b/tests/fixtures/create_keytab.py index 368c36d59..c44cb1a62 100644 --- a/tests/fixtures/create_keytab.py +++ b/tests/fixtures/create_keytab.py @@ -14,4 +14,4 @@ def create_keytab(tmp_path_factory): @pytest.fixture def keytab_md5(): - return hashlib.md5(b"content").hexdigest() # noqa: S324 + return hashlib.md5(b"content").hexdigest() # noqa: S324 # nosec diff --git a/tests/tests_integration/tests_hwm_store_integration.py b/tests/tests_integration/tests_hwm_store_integration.py index 8b6938b33..1b589de2d 100644 --- a/tests/tests_integration/tests_hwm_store_integration.py +++ b/tests/tests_integration/tests_hwm_store_integration.py @@ -11,7 +11,7 @@ hwm_store = [ MemoryHWMStore(), - YAMLHWMStore(path=tempfile.mktemp("hwmstore")), # noqa: S306 NOSONAR + YAMLHWMStore(path=tempfile.mktemp("hwmstore")), # noqa: S306 # nosec ] From b6f89f9800254ac772502eaa3d404e254815bb18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 08:09:58 +0000 Subject: [PATCH 56/63] [DOP-13779] Document HorizonHWMStore --- docs/hwm_store/index.rst | 7 +++++-- docs/hwm_store/yaml_hwm_store.rst | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/hwm_store/index.rst b/docs/hwm_store/index.rst index 884a72c1e..e07676989 100644 --- a/docs/hwm_store/index.rst +++ b/docs/hwm_store/index.rst @@ -3,11 +3,14 @@ HWM === -Since ``onetl>=0.10.0`` version, the HWM Store and HWM classes have been moved to a separate library :etl-entities:`etl-entities <>`. +Since ``onetl>=0.10.0`` version, the ``HWMStore`` and ``HWM`` classes have been moved to a separate library :etl-entities:`etl-entities <>`. -The only class was left intact is YamlHWMStore, **which is default** in onETL: +The only class was left intact is :ref:`yaml-hwm-store`, **which is default** in onETL. + +Other known implementation is `HorizonHWMStore `_. .. toctree:: :maxdepth: 2 + :hidden: yaml_hwm_store diff --git a/docs/hwm_store/yaml_hwm_store.rst b/docs/hwm_store/yaml_hwm_store.rst index 45cf2612a..6ac5e2530 100644 --- a/docs/hwm_store/yaml_hwm_store.rst +++ b/docs/hwm_store/yaml_hwm_store.rst @@ -1,7 +1,7 @@ .. _yaml-hwm-store: -YAML HWM Store -================================================================= +YAMLHWMStore +============ .. currentmodule:: onetl.hwm.store.yaml_hwm_store From bf51c19f3b97158770fc9e609eaaef94490946ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 08:10:23 +0000 Subject: [PATCH 57/63] [DOP-13779] Document HorizonHWMStore --- docs/hwm_store/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/hwm_store/index.rst b/docs/hwm_store/index.rst index e07676989..e6104d63c 100644 --- a/docs/hwm_store/index.rst +++ b/docs/hwm_store/index.rst @@ -3,7 +3,7 @@ HWM === -Since ``onetl>=0.10.0`` version, the ``HWMStore`` and ``HWM`` classes have been moved to a separate library :etl-entities:`etl-entities <>`. +Since onETL v0.10.0, the ``HWMStore`` and ``HWM`` classes have been moved to a separate library :etl-entities:`etl-entities <>`. The only class was left intact is :ref:`yaml-hwm-store`, **which is default** in onETL. 
From 857ecd02a1efda27ec48dfbb3923fafc0a54d547 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 08:49:00 +0000 Subject: [PATCH 58/63] [DOP-13779] Add robots.txt to documentation --- docs/conf.py | 1 + docs/robots.txt | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 docs/robots.txt diff --git a/docs/conf.py b/docs/conf.py index f65c954b3..568258709 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -84,6 +84,7 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] +html_extra_path = ["robots.txt"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. diff --git a/docs/robots.txt b/docs/robots.txt new file mode 100644 index 000000000..dda0be56a --- /dev/null +++ b/docs/robots.txt @@ -0,0 +1,4 @@ +User-agent: * +Allow: /*/stable/ +Allow: /en/stable/ # Fallback for bots that don't understand wildcards +Disallow: / From ba2b82445c69bf5fc3cf7f10b49596d299e1b5c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 08:51:14 +0000 Subject: [PATCH 59/63] [DOP-13779] Add docs config --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 568258709..188b1cdbd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -84,7 +84,6 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] -html_extra_path = ["robots.txt"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -108,6 +107,7 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". 
html_static_path = ["_static"] +html_extra_path = ["robots.txt"] html_logo = "./_static/logo.svg" favicons = [ {"rel": "icon", "href": "icon.svg", "type": "image/svg+xml"}, From aa6f11c51550b3b0316b2875b1db727773f8a169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 20 Mar 2024 08:54:57 +0000 Subject: [PATCH 60/63] [DOP-13779] Update robots.txt --- docs/robots.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/robots.txt b/docs/robots.txt index dda0be56a..dcab3dc9e 100644 --- a/docs/robots.txt +++ b/docs/robots.txt @@ -2,3 +2,4 @@ User-agent: * Allow: /*/stable/ Allow: /en/stable/ # Fallback for bots that don't understand wildcards Disallow: / +Sitemap: https://onel.readthedocs.io/sitemap.xml From fce801a750f197115b49dafeb3489e7c4a59e06b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 21 Mar 2024 07:35:45 +0000 Subject: [PATCH 61/63] [DOP-13779] Prepare for release --- docs/changelog/0.10.2.rst | 41 +++++++++++++++++++ docs/changelog/index.rst | 1 + .../next_release/211.improvement.rst | 5 --- .../next_release/228.improvement.rst | 5 --- docs/changelog/next_release/229.feature.rst | 4 -- .../next_release/229.improvement.rst | 5 --- docs/changelog/next_release/230.feature.rst | 1 - .../next_release/233.improvement.rst | 5 --- .../next_release/234.improvement.rst | 5 --- .../next_release/235.improvement.rst | 5 --- .../next_release/236.improvement.rst | 4 -- docs/changelog/next_release/237.feature.rst | 1 - docs/changelog/next_release/238.bugfix.rst | 2 - docs/changelog/next_release/239.bugfix.rst | 8 ---- .../next_release/240.improvement.rst | 4 -- 15 files changed, 42 insertions(+), 54 deletions(-) create mode 100644 docs/changelog/0.10.2.rst delete mode 100644 docs/changelog/next_release/211.improvement.rst delete mode 100644 docs/changelog/next_release/228.improvement.rst delete mode 100644 docs/changelog/next_release/229.feature.rst delete mode 100644 docs/changelog/next_release/229.improvement.rst delete mode 100644 docs/changelog/next_release/230.feature.rst delete mode 100644 docs/changelog/next_release/233.improvement.rst delete mode 100644 docs/changelog/next_release/234.improvement.rst delete mode 100644 docs/changelog/next_release/235.improvement.rst delete mode 100644 docs/changelog/next_release/236.improvement.rst delete mode 100644 docs/changelog/next_release/237.feature.rst delete mode 100644 docs/changelog/next_release/238.bugfix.rst delete mode 100644 docs/changelog/next_release/239.bugfix.rst delete mode 100644 docs/changelog/next_release/240.improvement.rst diff --git a/docs/changelog/0.10.2.rst b/docs/changelog/0.10.2.rst new file mode 100644 index 000000000..3ec6c43fb --- /dev/null +++ b/docs/changelog/0.10.2.rst @@ -0,0 +1,41 @@ +0.10.2 (2024-03-21) +=================== + +Features +-------- + +- Add support of Pydantic v2. (:github:pull:`230`) +- Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. + This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. 
+ + For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, + without using intermediate columns or tables. (:github:pull:`229`) +- Allow calling ``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. (:github:pull:`237`) + + +Improvements +------------ + +- Improve database connections documentation: + * Add "Types" section describing mapping between Clickhouse and Spark types + * Add "Prerequisites" section describing different aspects of connecting to Clickhouse + * Separate documentation of ``DBReader`` and ``.sql()`` / ``.pipeline(...)`` + * Add examples for ``.fetch()`` and ``.execute()`` (:github:pull:`211`, :github:pull:`228`, :github:pull:`229`, :github:pull:`233`, :github:pull:`234`, :github:pull:`235`, :github:pull:`236`, :github:pull:`240`) +- Add notes to Greenplum documentation about issues with IP resolution and building ``gpfdist`` URL (:github:pull:`228`) + + +Bug Fixes +--------- + +- Return back handling of ``DBReader(columns="string")``. This was a valid syntax up to v0.10 release, but it was removed because + most of users neved used it. It looks that we were wrong, returning this behavior back, but with deprecation warning. (:github:pull:`238`) +- Downgrade Greenplum package version from ``2.3.0`` to ``2.2.0``. (:github:pull:`239`) + + This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x. + Connector can open transaction with ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks. + + For using this connector with Greenplum 7.x, please pass package version explicitly: + + .. code-block:: python + + maven_packages = Greenplum.get_packages(package_version="2.3.0", ...) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 45f7c283b..715b7fdf6 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 0.10.2 0.10.1 0.10.0 0.9.5 diff --git a/docs/changelog/next_release/211.improvement.rst b/docs/changelog/next_release/211.improvement.rst deleted file mode 100644 index 5deec2051..000000000 --- a/docs/changelog/next_release/211.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve Clickhouse documentation: - * Add "Types" section describing mapping between Clickhouse and Spark types - * Add "Prerequisites" section describing different aspects of connecting to Clickhouse - * Separate documentation of ``DBReader`` and ``Clickhouse.sql`` - * Add examples for ``Clickhouse.fetch`` and ``Clickhouse.execute`` diff --git a/docs/changelog/next_release/228.improvement.rst b/docs/changelog/next_release/228.improvement.rst deleted file mode 100644 index 8b5c4c284..000000000 --- a/docs/changelog/next_release/228.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve Greenplum documentation: - * Add "Types" section describing mapping between Greenplum and Spark types - * Add more examples of reading and writing data from Greenplum - * Add examples for ``Greenplum.fetch`` and ``Greenplum.execute`` - * Add notes about issues with IP resolution and building ``gpfdist`` URL diff --git a/docs/changelog/next_release/229.feature.rst b/docs/changelog/next_release/229.feature.rst deleted file mode 100644 index 1175fe4d0..000000000 --- a/docs/changelog/next_release/229.feature.rst +++ /dev/null @@ -1,4 +0,0 @@ -Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. 
-This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. - -For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, without using intermediate columns or tables. diff --git a/docs/changelog/next_release/229.improvement.rst b/docs/changelog/next_release/229.improvement.rst deleted file mode 100644 index d0d2d6b57..000000000 --- a/docs/changelog/next_release/229.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve Postgres documentation: - * Add "Types" section describing mapping between Postgres and Spark types - * Add "Prerequisites" section describing different aspects of connecting to Postgres - * Separate documentation of ``DBReader`` and ``Postgres.sql`` - * Add examples for ``Postgres.fetch`` and ``Postgres.execute`` diff --git a/docs/changelog/next_release/230.feature.rst b/docs/changelog/next_release/230.feature.rst deleted file mode 100644 index 9220b131f..000000000 --- a/docs/changelog/next_release/230.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add support of Pydantic v2. diff --git a/docs/changelog/next_release/233.improvement.rst b/docs/changelog/next_release/233.improvement.rst deleted file mode 100644 index 5b1913eba..000000000 --- a/docs/changelog/next_release/233.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve Oracle documentation: - * Add "Types" section describing mapping between Oracle and Spark types - * Add "Prerequisites" section describing different aspects of connecting to Oracle - * Separate documentation of ``DBReader`` and ``Oracle.sql`` - * Add examples for ``Oracle.fetch`` and ``Oracle.execute`` diff --git a/docs/changelog/next_release/234.improvement.rst b/docs/changelog/next_release/234.improvement.rst deleted file mode 100644 index 89857ef03..000000000 --- a/docs/changelog/next_release/234.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve MySQL documentation: - * Add "Types" section describing mapping between MySQL and Spark types - * Add "Prerequisites" section describing different aspects of connecting to MySQL - * Separate documentation of ``DBReader`` and ``MySQL.sql`` - * Add examples for ``MySQL.fetch`` and ``MySQL.execute`` diff --git a/docs/changelog/next_release/235.improvement.rst b/docs/changelog/next_release/235.improvement.rst deleted file mode 100644 index 6875f015f..000000000 --- a/docs/changelog/next_release/235.improvement.rst +++ /dev/null @@ -1,5 +0,0 @@ -Improve MSSQL documentation: - * Add "Types" section describing mapping between MSSQL and Spark types - * Add "Prerequisites" section describing different aspects of connecting to MSSQL - * Separate documentation of ``DBReader`` and ``MSSQL.sql`` - * Add examples for ``MSSQL.fetch`` and ``MSSQL.execute`` diff --git a/docs/changelog/next_release/236.improvement.rst b/docs/changelog/next_release/236.improvement.rst deleted file mode 100644 index a56c624fc..000000000 --- a/docs/changelog/next_release/236.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Improve MongoDB documentation: - * Add "Types" section describing mapping between MongoDB and Spark types - * Add "Prerequisites" section describing different aspects of connecting to MongoDB - * Separate documentation of ``DBReader`` and ``MongoDB.pipeline`` diff --git a/docs/changelog/next_release/237.feature.rst b/docs/changelog/next_release/237.feature.rst deleted file mode 100644 index b053e437c..000000000 --- a/docs/changelog/next_release/237.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Allow calling 
``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. diff --git a/docs/changelog/next_release/238.bugfix.rst b/docs/changelog/next_release/238.bugfix.rst deleted file mode 100644 index 558a16d08..000000000 --- a/docs/changelog/next_release/238.bugfix.rst +++ /dev/null @@ -1,2 +0,0 @@ -Return back handling of ``DBReader(columns="string")``. This was a valid syntax up to v0.10 release, but it was removed because -most of users neved used it. It looks that we were wrong, returning this behavior back, but with deprecation warning. diff --git a/docs/changelog/next_release/239.bugfix.rst b/docs/changelog/next_release/239.bugfix.rst deleted file mode 100644 index c6b9bc48d..000000000 --- a/docs/changelog/next_release/239.bugfix.rst +++ /dev/null @@ -1,8 +0,0 @@ -Downgrade Greenplum package version from 2.3.0 to 2.2.0. This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x. -Connector can open transaction with ``SELECT * FROM table LIMIT 0`` query, but does not close it, which leads to deadlocks. - -For using this connector with Greenplum 7.x, please pass package version explicitly: - -.. code-block:: python - - maven_packages = Greenplum.get_packages(package_version="2.3.0", ...) diff --git a/docs/changelog/next_release/240.improvement.rst b/docs/changelog/next_release/240.improvement.rst deleted file mode 100644 index 2284bd0d9..000000000 --- a/docs/changelog/next_release/240.improvement.rst +++ /dev/null @@ -1,4 +0,0 @@ -Improve Teradata documentation: - * Add "Prerequisites" section describing different aspects of connecting to Teradata - * Separate documentation of ``DBReader`` and ``Teradata.sql`` - * Add examples for ``Teradata.fetch`` and ``Teradata.execute`` From 048b615dc386a8507bdbbcfc23804e1c703c5ff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 21 Mar 2024 07:41:50 +0000 Subject: [PATCH 62/63] [DOP-13779] Prepare for release --- docs/changelog/0.10.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/0.10.2.rst b/docs/changelog/0.10.2.rst index 3ec6c43fb..ff72ad640 100644 --- a/docs/changelog/0.10.2.rst +++ b/docs/changelog/0.10.2.rst @@ -10,7 +10,6 @@ Features For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, without using intermediate columns or tables. (:github:pull:`229`) -- Allow calling ``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. (:github:pull:`237`) Improvements @@ -22,6 +21,7 @@ Improvements * Separate documentation of ``DBReader`` and ``.sql()`` / ``.pipeline(...)`` * Add examples for ``.fetch()`` and ``.execute()`` (:github:pull:`211`, :github:pull:`228`, :github:pull:`229`, :github:pull:`233`, :github:pull:`234`, :github:pull:`235`, :github:pull:`236`, :github:pull:`240`) - Add notes to Greenplum documentation about issues with IP resolution and building ``gpfdist`` URL (:github:pull:`228`) +- Allow calling ``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. 
(:github:pull:`237`) Bug Fixes From 157ceba0d84a371dca856b0d5c74a0709d243bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 21 Mar 2024 07:42:12 +0000 Subject: [PATCH 63/63] [DOP-13779] Prepare for release --- docs/changelog/0.10.2.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/changelog/0.10.2.rst b/docs/changelog/0.10.2.rst index ff72ad640..c516735b9 100644 --- a/docs/changelog/0.10.2.rst +++ b/docs/changelog/0.10.2.rst @@ -5,11 +5,6 @@ Features -------- - Add support of Pydantic v2. (:github:pull:`230`) -- Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. - This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. - - For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, - without using intermediate columns or tables. (:github:pull:`229`) Improvements @@ -22,6 +17,11 @@ Improvements * Add examples for ``.fetch()`` and ``.execute()`` (:github:pull:`211`, :github:pull:`228`, :github:pull:`229`, :github:pull:`233`, :github:pull:`234`, :github:pull:`235`, :github:pull:`236`, :github:pull:`240`) - Add notes to Greenplum documentation about issues with IP resolution and building ``gpfdist`` URL (:github:pull:`228`) - Allow calling ``MongoDB.pipeline(...)`` with passing just collection name, without explicit aggregation pipeline. (:github:pull:`237`) +- Update default ``Postgres(extra={...})`` to include ``{"stringtype": "unspecified"}`` option. + This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. + + For example, now it is possible to read column of type ``money`` as Spark's ``StringType()``, and write it back to the same column, + without using intermediate columns or tables. (:github:pull:`229`) Bug Fixes
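
(For context on the changelog entry reshuffled by the last two patches, here is a hedged sketch of the new ``Postgres(extra={...})`` default in onETL 0.10.2. It is not part of the patch series; the Spark session setup is standard PySpark, and the host, credentials and database name are placeholders.)

.. code-block:: python

    from pyspark.sql import SparkSession

    from onetl.connection import Postgres

    # Spark session with the Postgres JDBC driver on the classpath;
    # Postgres.get_packages() returns the Maven coordinates bundled with this
    # onETL version.
    spark = (
        SparkSession.builder.appName("onetl-example")
        .config("spark.jars.packages", ",".join(Postgres.get_packages()))
        .getOrCreate()
    )

    # Since 0.10.2 the connection "extra" already includes
    # {"stringtype": "unspecified"}, so string values written through Spark
    # can be cast by Postgres itself into non-text columns (e.g. money).
    # Passing it explicitly, as below, only documents the behaviour and is
    # also the place to override it.
    postgres = Postgres(
        host="postgres.example.com",  # placeholder
        user="onetl_user",  # placeholder
        password="change_me",  # placeholder
        database="analytics",  # placeholder
        spark=spark,
        extra={"stringtype": "unspecified"},
    )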