From 6b2cc12234eafc27b9a5da930b5a62c60c916812 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?=
 =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?=
 =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?=
Date: Wed, 21 Aug 2024 10:07:00 +0000
Subject: [PATCH] Test Spark 3.5.2

---
 .github/workflows/data/clickhouse/matrix.yml | 2 +-
 .github/workflows/data/core/matrix.yml | 2 +-
 .github/workflows/data/hdfs/matrix.yml | 2 +-
 .github/workflows/data/hive/matrix.yml | 2 +-
 .github/workflows/data/kafka/matrix.yml | 2 +-
 .github/workflows/data/local-fs/matrix.yml | 4 ++--
 .github/workflows/data/mongodb/matrix.yml | 2 +-
 .github/workflows/data/mssql/matrix.yml | 2 +-
 .github/workflows/data/mysql/matrix.yml | 2 +-
 .github/workflows/data/oracle/matrix.yml | 2 +-
 .github/workflows/data/postgres/matrix.yml | 2 +-
 .github/workflows/data/s3/matrix.yml | 2 +-
 .github/workflows/data/teradata/matrix.yml | 2 +-
 CONTRIBUTING.rst | 2 +-
 README.rst | 6 +++---
 docker-compose.yml | 2 +-
 docker/Dockerfile | 2 +-
 docs/changelog/next_release/306.feature.rst | 1 +
 .../db_connection/clickhouse/types.rst | 4 ++--
 docs/connection/db_connection/mssql/types.rst | 4 ++--
 docs/connection/db_connection/mysql/types.rst | 4 ++--
 docs/connection/db_connection/oracle/types.rst | 4 ++--
 docs/connection/db_connection/postgres/types.rst | 4 ++--
 onetl/_metrics/extract.py | 2 +-
 onetl/_metrics/listener/base.py | 2 +-
 onetl/_metrics/listener/execution.py | 12 ++++++------
 onetl/_metrics/listener/job.py | 4 ++--
 onetl/_metrics/listener/listener.py | 2 +-
 onetl/_metrics/listener/stage.py | 6 +++---
 onetl/_metrics/listener/task.py | 6 +++---
 onetl/_util/spark.py | 2 +-
 .../connection/db_connection/kafka/connection.py | 4 ++--
 .../file_df_connection/spark_s3/connection.py | 6 +++---
 onetl/file/format/avro.py | 4 ++--
 onetl/file/format/excel.py | 16 ++++++++--------
 onetl/file/format/xml.py | 8 ++++----
 .../tests/{spark-3.5.0.txt => spark-3.5.2.txt} | 2 +-
 .../test_file/test_format_unit/test_avro_unit.py | 8 ++++----
 .../test_format_unit/test_excel_unit.py | 16 ++++++++--------
 .../test_spark_s3_unit.py | 6 +++---
 40 files changed, 84 insertions(+), 83 deletions(-)
 create mode 100644 docs/changelog/next_release/306.feature.rst
 rename requirements/tests/{spark-3.5.0.txt => spark-3.5.2.txt} (76%)

diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml
index 6f1d72619..d18856dff 100644
--- a/.github/workflows/data/clickhouse/matrix.yml
+++ b/.github/workflows/data/clickhouse/matrix.yml
@@ -11,7 +11,7 @@ min: &min
 max: &max
 clickhouse-image: clickhouse/clickhouse-server
 clickhouse-version: 24.6.3.70-alpine
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml
index d20f074ab..504f1d4dc 100644
--- a/.github/workflows/data/core/matrix.yml
+++ b/.github/workflows/data/core/matrix.yml
@@ -6,7 +6,7 @@ min: &min
 os: ubuntu-latest
 max: &max
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml
index af4553f14..f8bae7d59 100644
--- a/.github/workflows/data/hdfs/matrix.yml
+++ b/.github/workflows/data/hdfs/matrix.yml
@@ -8,7 +8,7 @@ min: &min
 max: &max
 hadoop-version: hadoop3-hdfs
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml
index 6ce0d7a8e..31b2120fe 100644
--- a/.github/workflows/data/hive/matrix.yml
+++ b/.github/workflows/data/hive/matrix.yml
@@ -6,7 +6,7 @@ min: &min
 os: ubuntu-latest
 max: &max
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml
index 1b9b23367..4ff5fe645 100644
--- a/.github/workflows/data/kafka/matrix.yml
+++ b/.github/workflows/data/kafka/matrix.yml
@@ -12,7 +12,7 @@ min: &min
 max: &max
 kafka-version: 3.7.1
 pydantic-version: 2
- spark-version: 3.5.1
+ spark-version: 3.5.2
 python-version: '3.12'
 java-version: 20
 os: ubuntu-latest
diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml
index d1337291e..c4466f3c8 100644
--- a/.github/workflows/data/local-fs/matrix.yml
+++ b/.github/workflows/data/local-fs/matrix.yml
@@ -20,8 +20,8 @@ min_excel: &min_excel
 os: ubuntu-latest
 max: &max
- # Excel package currently has no release for 3.5.1
- spark-version: 3.5.0
+ # Excel package currently has no release for 3.5.2
+ spark-version: 3.5.1
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml
index 98e1fe971..4c3d9d86f 100644
--- a/.github/workflows/data/mongodb/matrix.yml
+++ b/.github/workflows/data/mongodb/matrix.yml
@@ -9,7 +9,7 @@ min: &min
 max: &max
 mongodb-version: 7.0.12
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml
index fad2e738c..3748a0a77 100644
--- a/.github/workflows/data/mssql/matrix.yml
+++ b/.github/workflows/data/mssql/matrix.yml
@@ -8,7 +8,7 @@ min: &min
 max: &max
 mssql-version: 2022-CU14-ubuntu-22.04
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml
index d2e703143..17dacdb24 100644
--- a/.github/workflows/data/mysql/matrix.yml
+++ b/.github/workflows/data/mysql/matrix.yml
@@ -10,7 +10,7 @@ min: &min
 max: &max
 mysql-version: 9.0.1
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml
index 7a79c68a7..ccafa20fc 100644
--- a/.github/workflows/data/oracle/matrix.yml
+++ b/.github/workflows/data/oracle/matrix.yml
@@ -12,7 +12,7 @@ max: &max
 oracle-image: gvenzl/oracle-free
 oracle-version: 23.4-slim-faststart
 db-name: FREEPDB1
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml
index 4c5b5f4ef..d37c3a836 100644
--- a/.github/workflows/data/postgres/matrix.yml
+++ b/.github/workflows/data/postgres/matrix.yml
@@ -9,7 +9,7 @@ min: &min
 max: &max
 postgres-version: 16.3-alpine
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml
index 06d4f7489..405b8b686 100644
--- a/.github/workflows/data/s3/matrix.yml
+++ b/.github/workflows/data/s3/matrix.yml
@@ -10,7 +10,7 @@ min: &min
 max: &max
 minio-version: 2024.7.26
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml
index 6c2a55455..d9792be6a 100644
--- a/.github/workflows/data/teradata/matrix.yml
+++ b/.github/workflows/data/teradata/matrix.yml
@@ -1,5 +1,5 @@
 max: &max
- spark-version: 3.5.1
+ spark-version: 3.5.2
 pydantic-version: 2
 python-version: '3.12'
 java-version: 20
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 7a70dbac7..aa1a3c034 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -71,7 +71,7 @@ Create virtualenv and install dependencies:
 -r requirements/tests/postgres.txt \
 -r requirements/tests/oracle.txt \
 -r requirements/tests/pydantic-2.txt \
- -r requirements/tests/spark-3.5.1.txt
+ -r requirements/tests/spark-3.5.2.txt
 # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4
 pip install sphinx-plantuml --no-deps
diff --git a/README.rst b/README.rst
index 0a4cbc97d..9def167ff 100644
--- a/README.rst
+++ b/README.rst
@@ -184,7 +184,7 @@ Compatibility matrix
 +--------------------------------------------------------------+-------------+-------------+-------+
 | `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
-| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
+| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
 .. _pyspark-install:
@@ -199,7 +199,7 @@ or install PySpark explicitly:
 .. code:: bash
- pip install onetl pyspark==3.5.1 # install a specific PySpark version
+ pip install onetl pyspark==3.5.2 # install a specific PySpark version
 or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance.
 **Otherwise connection object cannot be created.**
@@ -540,7 +540,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th
 setup_logging()
 # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded
- maven_packages = SparkS3.get_packages(spark_version="3.5.1") + Postgres.get_packages()
+ maven_packages = SparkS3.get_packages(spark_version="3.5.2") + Postgres.get_packages()
 spark = (
 SparkSession.builder.appName("spark_app_onetl_demo")
 .config("spark.jars.packages", ",".join(maven_packages))
diff --git a/docker-compose.yml b/docker-compose.yml
index f5859bb5d..73e8a21e1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,7 +9,7 @@ services:
 context: .
 target: base
 args:
- SPARK_VERSION: 3.5.1
+ SPARK_VERSION: 3.5.2
 env_file: .env.docker
 volumes:
 - ./:/app/
diff --git a/docker/Dockerfile b/docker/Dockerfile
index d3d34ef21..68f40a522 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -44,7 +44,7 @@ ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH}
 COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/
 RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh
-ARG SPARK_VERSION=3.5.1
+ARG SPARK_VERSION=3.5.2
 # Spark is heavy, and version change is quite rare
 COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/
 RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt
diff --git a/docs/changelog/next_release/306.feature.rst b/docs/changelog/next_release/306.feature.rst
new file mode 100644
index 000000000..1c2b95f7f
--- /dev/null
+++ b/docs/changelog/next_release/306.feature.rst
@@ -0,0 +1 @@
+Update ``Excel`` package from ``0.20.3`` to ``0.20.4``, to include Spark 3.5.1 support.
diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst
index 21ddf0bad..0d8c56750 100644
--- a/docs/connection/db_connection/clickhouse/types.rst
+++ b/docs/connection/db_connection/clickhouse/types.rst
@@ -106,8 +106,8 @@ References
 Here you can find source code with type conversions:
 * `Clickhouse -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> Clickhouse `_
 Supported types
diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst
index 807d62d9c..852289adb 100644
--- a/docs/connection/db_connection/mssql/types.rst
+++ b/docs/connection/db_connection/mssql/types.rst
@@ -101,8 +101,8 @@ References
 Here you can find source code with type conversions:
 * `MSSQL -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> MSSQL `_
 Supported types
diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst
index 1ad6815cc..001a221f2 100644
--- a/docs/connection/db_connection/mysql/types.rst
+++ b/docs/connection/db_connection/mysql/types.rst
@@ -97,8 +97,8 @@ References
 Here you can find source code with type conversions:
 * `MySQL -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> MySQL `_
 Supported types
diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst
index 81b7da101..2433b0f7e 100644
--- a/docs/connection/db_connection/oracle/types.rst
+++ b/docs/connection/db_connection/oracle/types.rst
@@ -101,8 +101,8 @@ See `List of Oracle types Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 Numeric types
 ~~~~~~~~~~~~~
diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst
index b4d9d2029..f0fe8821c 100644
--- a/docs/connection/db_connection/postgres/types.rst
+++ b/docs/connection/db_connection/postgres/types.rst
@@ -109,8 +109,8 @@ See `List of Postgres types JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 Numeric types
 ~~~~~~~~~~~~~
diff --git a/onetl/_metrics/extract.py b/onetl/_metrics/extract.py
index 4789d8fd4..8b623bb84 100644
--- a/onetl/_metrics/extract.py
+++ b/onetl/_metrics/extract.py
@@ -70,7 +70,7 @@ def extract_metrics_from_execution(execution: SparkListenerExecution) -> SparkCo
 disk_spilled_bytes += stage.metrics.disk_spilled_bytes
 result_size_bytes += stage.metrics.result_size_bytes
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473
 input_file_count = (
 _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_FILES_READ)
 or _get_int(execution.metrics, SparkSQLMetricNames.STATIC_NUMBER_OF_FILES_READ)
diff --git a/onetl/_metrics/listener/base.py b/onetl/_metrics/listener/base.py
index 90432c7c3..a8d5b8555 100644
--- a/onetl/_metrics/listener/base.py
+++ b/onetl/_metrics/listener/base.py
@@ -16,7 +16,7 @@ class BaseSparkListener:
 """Base no-op SparkListener implementation.
- See `SparkListener `_ interface.
+ See `SparkListener `_ interface.
 """
 spark: SparkSession
diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py
index 728c4c2ca..f5749e161 100644
--- a/onetl/_metrics/listener/execution.py
+++ b/onetl/_metrics/listener/execution.py
@@ -22,18 +22,18 @@ class SparkSQLMetricNames(str, Enum):  # noqa: WPS338
 # Metric names passed to SQLMetrics.createMetric(...)
 # But only those we're interested in.
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L233C55-L233C87
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L231
 NUMBER_OF_PARTITIONS_READ = "number of partitions read"
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
 NUMBER_OF_FILES_READ = "number of files read"
 SIZE_OF_FILES_READ = "size of files read"
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L455-L456
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
 STATIC_NUMBER_OF_FILES_READ = "static number of files read"
 STATIC_SIZE_OF_FILES_READ = "static size of files read"
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246
 NUMBER_OF_DYNAMIC_PART = "number of dynamic part"
 NUMBER_OF_WRITTEN_FILES = "number of written files"
@@ -62,11 +62,11 @@ def jobs(self) -> list[SparkListenerJob]:
 return result
 def on_execution_start(self, event):
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58
 self.status = SparkListenerExecutionStatus.STARTED
 def on_execution_end(self, event):
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83
 for job in self._jobs.values():
 if job.status == SparkListenerJobStatus.FAILED:
 self.status = SparkListenerExecutionStatus.FAILED
diff --git a/onetl/_metrics/listener/job.py b/onetl/_metrics/listener/job.py
index b3abbd061..915f1f3de 100644
--- a/onetl/_metrics/listener/job.py
+++ b/onetl/_metrics/listener/job.py
@@ -38,8 +38,8 @@ def stages(self) -> list[SparkListenerStage]:
 @classmethod
 def create(cls, event):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html
 result = cls(
 id=event.jobId(),
 description=event.properties().get("spark.job.description"),
diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py
index 3421e5ae0..997f22a7b 100644
--- a/onetl/_metrics/listener/listener.py
+++ b/onetl/_metrics/listener/listener.py
@@ -73,7 +73,7 @@ def onExecutionEnd(self, event):
 # Get execution metrics from SQLAppStatusStore,
 # as SparkListenerSQLExecutionEnd event does not provide them:
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala
+ # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala
 session_status_store = self.spark._jsparkSession.sharedState().statusStore()  # noqa: WPS437
 raw_execution = session_status_store.execution(execution.id).get()
 metrics = raw_execution.metrics()
diff --git a/onetl/_metrics/listener/stage.py b/onetl/_metrics/listener/stage.py
index 4bf4dffb0..89d6a6aeb 100644
--- a/onetl/_metrics/listener/stage.py
+++ b/onetl/_metrics/listener/stage.py
@@ -21,7 +21,7 @@ def __str__(self):
 @dataclass
 class SparkListenerStage:
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/StageInfo.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/StageInfo.html
 id: int
 status: SparkListenerStageStatus = SparkListenerStageStatus.PENDING
 metrics: SparkListenerTaskMetrics = field(default_factory=SparkListenerTaskMetrics, repr=False, init=False)
@@ -39,11 +39,11 @@ def create(cls, stage_info):
 return cls(id=stage_info.stageId())
 def on_stage_start(self, event):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html
 self.status = SparkListenerStageStatus.ACTIVE
 def on_stage_end(self, event):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html
 stage_info = event.stageInfo()
 if stage_info.failureReason().isDefined():
 self.status = SparkListenerStageStatus.FAILED
diff --git a/onetl/_metrics/listener/task.py b/onetl/_metrics/listener/task.py
index 4b27ffcfa..ced938a86 100644
--- a/onetl/_metrics/listener/task.py
+++ b/onetl/_metrics/listener/task.py
@@ -81,14 +81,14 @@ class SparkListenerTask:
 @classmethod
 def create(cls, task_info):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/TaskInfo.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/TaskInfo.html
 return cls(id=task_info.taskId())
 def on_task_start(self, event):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html
 self.status = SparkListenerTaskStatus(event.taskInfo().status())
 def on_task_end(self, event):
- # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html
+ # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html
 self.status = SparkListenerTaskStatus(event.taskInfo().status())
 self.metrics = SparkListenerTaskMetrics.create(event.taskMetrics())
diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py
index f7d018b39..547095af4 100644
--- a/onetl/_util/spark.py
+++ b/onetl/_util/spark.py
@@ -143,7 +143,7 @@ def estimate_dataframe_size(spark_session: SparkSession, df: DataFrame) -> int:
 """
 Estimate in-memory DataFrame size in bytes. If cannot be estimated, return 0.
- Using Spark's `SizeEstimator `_.
+ Using Spark's `SizeEstimator `_.
 """
 try:
 size_estimator = spark_session._jvm.org.apache.spark.util.SizeEstimator  # type: ignore[union-attr]
diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py
index b404eafba..9b8bf2cde 100644
--- a/onetl/connection/db_connection/kafka/connection.py
+++ b/onetl/connection/db_connection/kafka/connection.py
@@ -332,7 +332,7 @@ def write_df_to_target(
 write_options.update(options.dict(by_alias=True, exclude_none=True, exclude={"if_exists"}))
 write_options["topic"] = target
- # As of Apache Spark version 3.5.0, the mode 'error' is not functioning as expected.
+ # As of Apache Spark version 3.5.2, the mode 'error' is not functioning as expected.
 # This issue has been reported and can be tracked at:
 # https://issues.apache.org/jira/browse/SPARK-44774
 mode = options.if_exists
@@ -418,7 +418,7 @@ def get_packages(
 from onetl.connection import Kafka
 Kafka.get_packages(spark_version="3.2.4")
- Kafka.get_packages(spark_version="3.2.4", scala_version="2.13")
+ Kafka.get_packages(spark_version="3.2.4", scala_version="2.12")
 """
diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py
index eb74d6981..8fe07d106 100644
--- a/onetl/connection/file_df_connection/spark_s3/connection.py
+++ b/onetl/connection/file_df_connection/spark_s3/connection.py
@@ -133,7 +133,7 @@ class SparkS3(SparkFileDFConnection):
 from pyspark.sql import SparkSession
 # Create Spark session with Hadoop AWS libraries loaded
- maven_packages = SparkS3.get_packages(spark_version="3.5.0")
+ maven_packages = SparkS3.get_packages(spark_version="3.5.2")
 # Some dependencies are not used, but downloading takes a lot of time. Skipping them.
 excluded_packages = [
 "com.google.cloud.bigdataoss:gcs-connector",
@@ -236,8 +236,8 @@ def get_packages(
 from onetl.connection import SparkS3
- SparkS3.get_packages(spark_version="3.5.0")
- SparkS3.get_packages(spark_version="3.5.0", scala_version="2.12")
+ SparkS3.get_packages(spark_version="3.5.2")
+ SparkS3.get_packages(spark_version="3.5.2", scala_version="2.12")
 """
diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py
index 3699620b0..418e4064e 100644
--- a/onetl/file/format/avro.py
+++ b/onetl/file/format/avro.py
@@ -88,7 +88,7 @@ class Avro(ReadWriteFileFormat):
 from pyspark.sql import SparkSession
 # Create Spark session with Avro package loaded
- maven_packages = Avro.get_packages(spark_version="3.5.0")
+ maven_packages = Avro.get_packages(spark_version="3.5.2")
 spark = (
 SparkSession.builder.appName("spark-app-name")
 .config("spark.jars.packages", ",".join(maven_packages))
@@ -151,7 +151,7 @@ def get_packages(
 from onetl.file.format import Avro
 Avro.get_packages(spark_version="3.2.4")
- Avro.get_packages(spark_version="3.2.4", scala_version="2.13")
+ Avro.get_packages(spark_version="3.2.4", scala_version="2.12")
 """
diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py
index 2ec12758a..3f26522f0 100644
--- a/onetl/file/format/excel.py
+++ b/onetl/file/format/excel.py
@@ -87,7 +87,7 @@ class Excel(ReadWriteFileFormat):
 from pyspark.sql import SparkSession
 # Create Spark session with Excel package loaded
- maven_packages = Excel.get_packages(spark_version="3.5.0")
+ maven_packages = Excel.get_packages(spark_version="3.5.1")
 spark = (
 SparkSession.builder.appName("spark-app-name")
 .config("spark.jars.packages", ",".join(maven_packages))
@@ -139,7 +139,7 @@ def get_packages(
 If ``None``, ``spark_version`` is used to determine Scala version.
 package_version : str, optional
- Package version in format ``major.minor.patch``. Default is ``0.20.3``.
+ Package version in format ``major.minor.patch``. Default is ``0.20.4``.
 .. warning::
@@ -157,12 +157,12 @@ def get_packages(
 from onetl.file.format import Excel
- Excel.get_packages(spark_version="3.5.0")
- Excel.get_packages(spark_version="3.5.0", scala_version="2.13")
+ Excel.get_packages(spark_version="3.5.1")
+ Excel.get_packages(spark_version="3.5.1", scala_version="2.12")
 Excel.get_packages(
- spark_version="3.5.0",
- scala_version="2.13",
- package_version="0.20.3",
+ spark_version="3.5.1",
+ scala_version="2.12",
+ package_version="0.20.4",
 )
 """
@@ -176,7 +176,7 @@ def get_packages(
 raise ValueError(f"Package version should be at least 0.15, got {package_version}")
 log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version)
 else:
- version = Version("0.20.3")
+ version = Version("0.20.4")
 spark_ver = Version(spark_version).min_digits(3)
 if spark_ver < Version("3.2"):
diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py
index cc7cd4777..11425809e 100644
--- a/onetl/file/format/xml.py
+++ b/onetl/file/format/xml.py
@@ -119,7 +119,7 @@ class XML(ReadWriteFileFormat):
 from pyspark.sql import SparkSession
 # Create Spark session with XML package loaded
- maven_packages = XML.get_packages(spark_version="3.5.0")
+ maven_packages = XML.get_packages(spark_version="3.5.2")
 spark = (
 SparkSession.builder.appName("spark-app-name")
 .config("spark.jars.packages", ",".join(maven_packages))
@@ -184,10 +184,10 @@ def get_packages(  # noqa: WPS231
 from onetl.file.format import XML
- XML.get_packages(spark_version="3.5.0")
- XML.get_packages(spark_version="3.5.0", scala_version="2.12")
+ XML.get_packages(spark_version="3.5.2")
+ XML.get_packages(spark_version="3.5.2", scala_version="2.12")
 XML.get_packages(
- spark_version="3.5.0",
+ spark_version="3.5.2",
 scala_version="2.12",
 package_version="0.18.0",
 )
diff --git a/requirements/tests/spark-3.5.0.txt b/requirements/tests/spark-3.5.2.txt
similarity index 76%
rename from requirements/tests/spark-3.5.0.txt
rename to requirements/tests/spark-3.5.2.txt
index 2e49168a5..214f0d63b 100644
--- a/requirements/tests/spark-3.5.0.txt
+++ b/requirements/tests/spark-3.5.2.txt
@@ -1,5 +1,5 @@
 numpy>=1.16
 pandas>=1.0
 pyarrow>=1.0
-pyspark==3.5.0
+pyspark==3.5.2
 sqlalchemy
diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
index 3c2ef1605..53c7a67a6 100644
--- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
+++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
@@ -29,14 +29,14 @@ def test_avro_get_packages_scala_version_not_supported():
 [
 # Detect Scala version by Spark version
 ("2.4.0", None, "org.apache.spark:spark-avro_2.11:2.4.0"),
- ("3.5.0", None, "org.apache.spark:spark-avro_2.12:3.5.0"),
+ ("3.5.2", None, "org.apache.spark:spark-avro_2.12:3.5.2"),
 # Override Scala version
 ("2.4.0", "2.11", "org.apache.spark:spark-avro_2.11:2.4.0"),
 ("2.4.0", "2.12", "org.apache.spark:spark-avro_2.12:2.4.0"),
- ("3.5.0", "2.12", "org.apache.spark:spark-avro_2.12:3.5.0"),
- ("3.5.0", "2.13", "org.apache.spark:spark-avro_2.13:3.5.0"),
+ ("3.5.2", "2.12", "org.apache.spark:spark-avro_2.12:3.5.2"),
+ ("3.5.2", "2.13", "org.apache.spark:spark-avro_2.13:3.5.2"),
 # Scala version contain three digits when only two needed
- ("3.5.0", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.0"),
+ ("3.5.2", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.2"),
 ],
 )
 def test_avro_get_packages(spark_version, scala_version, package):
diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
index 95dae3da1..ecacb2ca7 100644
--- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
+++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
@@ -34,18 +34,18 @@ def test_excel_get_packages_package_version_not_supported():
 "spark_version, scala_version, package_version, packages",
 [
 # Detect Scala version by Spark version
- ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.3"]),
- ("3.5.0", None, None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]),
+ ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]),
+ ("3.5.2", None, None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
 # Override Scala version
- ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.3"]),
- ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.20.3"]),
- ("3.5.0", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]),
- ("3.5.0", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.0_0.20.3"]),
+ ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]),
+ ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.20.4"]),
+ ("3.5.2", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
+ ("3.5.2", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.2_0.20.4"]),
 # Override package version
 ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]),
- ("3.5.0", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.0_0.18.0"]),
+ ("3.5.2", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.2_0.18.0"]),
 # Scala version contain three digits when only two needed
- ("3.5.0", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]),
+ ("3.5.2", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
 ],
 )
 def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages):
diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
index 34ac4387f..9a5e6faca 100644
--- a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
+++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
@@ -10,9 +10,9 @@
 @pytest.mark.parametrize(
 "spark_version, scala_version, package",
 [
- ("3.5.0", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.0"),
- ("3.5.0", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.0"),
- ("3.5.0", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.0"),
+ ("3.5.2", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"),
+ ("3.5.2", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"),
+ ("3.5.2", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.2"),
 ],
 )
 def test_spark_s3_get_packages(spark_version, scala_version, package):