diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml
index 1373d5e8..075d98a0 100644
--- a/.github/workflows/data/clickhouse/matrix.yml
+++ b/.github/workflows/data/clickhouse/matrix.yml
@@ -11,7 +11,7 @@ min: &min
 max: &max
   clickhouse-image: clickhouse/clickhouse-server
   clickhouse-version: 24.8.2.3-alpine
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml
index 504f1d4d..67d32ce3 100644
--- a/.github/workflows/data/core/matrix.yml
+++ b/.github/workflows/data/core/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml
index f8bae7d5..bb913214 100644
--- a/.github/workflows/data/hdfs/matrix.yml
+++ b/.github/workflows/data/hdfs/matrix.yml
@@ -8,7 +8,7 @@ min: &min

 max: &max
   hadoop-version: hadoop3-hdfs
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml
index 31b2120f..6bb53edf 100644
--- a/.github/workflows/data/hive/matrix.yml
+++ b/.github/workflows/data/hive/matrix.yml
@@ -6,7 +6,7 @@ min: &min
   os: ubuntu-latest

 max: &max
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml
index 4ff5fe64..309c0cae 100644
--- a/.github/workflows/data/kafka/matrix.yml
+++ b/.github/workflows/data/kafka/matrix.yml
@@ -12,7 +12,7 @@ min: &min
 max: &max
   kafka-version: 3.7.1
   pydantic-version: 2
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   python-version: '3.12'
   java-version: 20
   os: ubuntu-latest
diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml
index 11892d65..6df9c853 100644
--- a/.github/workflows/data/mongodb/matrix.yml
+++ b/.github/workflows/data/mongodb/matrix.yml
@@ -9,7 +9,7 @@ min: &min

 max: &max
   mongodb-version: 7.0.14
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml
index 3748a0a7..490bc1fb 100644
--- a/.github/workflows/data/mssql/matrix.yml
+++ b/.github/workflows/data/mssql/matrix.yml
@@ -8,7 +8,7 @@ min: &min

 max: &max
   mssql-version: 2022-CU14-ubuntu-22.04
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml
index 17dacdb2..b98c985c 100644
--- a/.github/workflows/data/mysql/matrix.yml
+++ b/.github/workflows/data/mysql/matrix.yml
@@ -10,7 +10,7 @@ min: &min

 max: &max
   mysql-version: 9.0.1
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml
index ccafa20f..051f0df9 100644
--- a/.github/workflows/data/oracle/matrix.yml
+++ b/.github/workflows/data/oracle/matrix.yml
@@ -12,7 +12,7 @@ max: &max
   oracle-image: gvenzl/oracle-free
   oracle-version: 23.4-slim-faststart
   db-name: FREEPDB1
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml
index cd63ae03..43b914e9 100644
--- a/.github/workflows/data/postgres/matrix.yml
+++ b/.github/workflows/data/postgres/matrix.yml
@@ -9,7 +9,7 @@ min: &min

 max: &max
   postgres-version: 16.4-alpine
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml
index 3990f312..5a408373 100644
--- a/.github/workflows/data/s3/matrix.yml
+++ b/.github/workflows/data/s3/matrix.yml
@@ -10,7 +10,7 @@ min: &min

 max: &max
   minio-version: 2024.8.29
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml
index d9792be6..e9e4b5fa 100644
--- a/.github/workflows/data/teradata/matrix.yml
+++ b/.github/workflows/data/teradata/matrix.yml
@@ -1,5 +1,5 @@
 max: &max
-  spark-version: 3.5.2
+  spark-version: 3.5.3
   pydantic-version: 2
   python-version: '3.12'
   java-version: 20
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index aa1a3c03..fe08ff0b 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -71,7 +71,7 @@ Create virtualenv and install dependencies:
         -r requirements/tests/postgres.txt \
         -r requirements/tests/oracle.txt \
         -r requirements/tests/pydantic-2.txt \
-        -r requirements/tests/spark-3.5.2.txt
+        -r requirements/tests/spark-3.5.3.txt

     # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4
     pip install sphinx-plantuml --no-deps
diff --git a/README.rst b/README.rst
index 9def167f..79561c1d 100644
--- a/README.rst
+++ b/README.rst
@@ -184,7 +184,7 @@ Compatibility matrix
 +--------------------------------------------------------------+-------------+-------------+-------+
 | `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+
-| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
+| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 |
 +--------------------------------------------------------------+-------------+-------------+-------+

 .. _pyspark-install:
@@ -199,7 +199,7 @@ or install PySpark explicitly:

 .. code:: bash

-    pip install onetl pyspark==3.5.2  # install a specific PySpark version
+    pip install onetl pyspark==3.5.3  # install a specific PySpark version

 or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance.
 **Otherwise connection object cannot be created.**
@@ -540,7 +540,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th
     setup_logging()

     # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded
-    maven_packages = SparkS3.get_packages(spark_version="3.5.2") + Postgres.get_packages()
+    maven_packages = SparkS3.get_packages(spark_version="3.5.3") + Postgres.get_packages()
     spark = (
         SparkSession.builder.appName("spark_app_onetl_demo")
         .config("spark.jars.packages", ",".join(maven_packages))
diff --git a/docker-compose.yml b/docker-compose.yml
index d32f682a..9764ed00 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,7 +9,7 @@ services:
       context: .
       target: base
       args:
-        SPARK_VERSION: 3.5.2
+        SPARK_VERSION: 3.5.3
     env_file: .env.docker
     volumes:
       - ./:/app/
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 68f40a52..6dfbaf56 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -44,7 +44,7 @@ ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH}
 COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/
 RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh

-ARG SPARK_VERSION=3.5.2
+ARG SPARK_VERSION=3.5.3
 # Spark is heavy, and version change is quite rare
 COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/
 RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt
diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst
index 0d8c5675..6135e066 100644
--- a/docs/connection/db_connection/clickhouse/types.rst
+++ b/docs/connection/db_connection/clickhouse/types.rst
@@ -106,8 +106,8 @@ References
 Here you can find source code with type conversions:

 * `Clickhouse -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> Clickhouse `_

 Supported types
diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst
index 852289ad..3a8b2d36 100644
--- a/docs/connection/db_connection/mssql/types.rst
+++ b/docs/connection/db_connection/mssql/types.rst
@@ -101,8 +101,8 @@ References
 Here you can find source code with type conversions:

 * `MSSQL -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> MSSQL `_

 Supported types
diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst
index 001a221f..71795925 100644
--- a/docs/connection/db_connection/mysql/types.rst
+++ b/docs/connection/db_connection/mysql/types.rst
@@ -97,8 +97,8 @@ References
 Here you can find source code with type conversions:

 * `MySQL -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_
 * `JDBC -> MySQL `_

 Supported types
diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst
index 2433b0f7..28ece909 100644
--- a/docs/connection/db_connection/oracle/types.rst
+++ b/docs/connection/db_connection/oracle/types.rst
@@ -101,8 +101,8 @@ See `List of Oracle types `_

 Here you can find source code with type conversions:

-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_

 Numeric types
 ~~~~~~~~~~~~~
diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst
index f0fe8821..e47336f0 100644
--- a/docs/connection/db_connection/postgres/types.rst
+++ b/docs/connection/db_connection/postgres/types.rst
@@ -109,8 +109,8 @@ See `List of Postgres types `_
 Here you can find source code with type conversions:

 * `Postgres -> JDBC `_
-* `JDBC -> Spark `_
-* `Spark -> JDBC `_
+* `JDBC -> Spark `_
+* `Spark -> JDBC `_

 Numeric types
 ~~~~~~~~~~~~~
diff --git a/onetl/_metrics/extract.py b/onetl/_metrics/extract.py
index 4b058092..bd293779 100644
--- a/onetl/_metrics/extract.py
+++ b/onetl/_metrics/extract.py
@@ -70,7 +70,7 @@ def extract_metrics_from_execution(execution: SparkListenerExecution) -> SparkCo
         disk_spilled_bytes += stage.metrics.disk_spilled_bytes
         result_size_bytes += stage.metrics.result_size_bytes

-    # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473
+    # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473
     input_file_count = (
         _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_FILES_READ)
         or _get_int(execution.metrics, SparkSQLMetricNames.STATIC_NUMBER_OF_FILES_READ)
diff --git a/onetl/_metrics/listener/base.py b/onetl/_metrics/listener/base.py
index bbc6431c..d1b9555d 100644
--- a/onetl/_metrics/listener/base.py
+++ b/onetl/_metrics/listener/base.py
@@ -16,7 +16,7 @@ class BaseSparkListener:
     """Base no-op SparkListener implementation.

-    See `SparkListener `_ interface.
+    See `SparkListener `_ interface.
     """

     spark: SparkSession
diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py
index a0d2a522..ef2e7ffd 100644
--- a/onetl/_metrics/listener/execution.py
+++ b/onetl/_metrics/listener/execution.py
@@ -22,18 +22,18 @@ class SparkSQLMetricNames(str, Enum):  # noqa: WPS338
     # Metric names passed to SQLMetrics.createMetric(...)
     # But only those we're interested in.

-    # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L231
+    # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L231
     NUMBER_OF_PARTITIONS_READ = "number of partitions read"

-    # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
+    # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
     NUMBER_OF_FILES_READ = "number of files read"
     SIZE_OF_FILES_READ = "size of files read"

-    # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
+    # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227
     STATIC_NUMBER_OF_FILES_READ = "static number of files read"
     STATIC_SIZE_OF_FILES_READ = "static size of files read"

-    # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246
+    # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246
     NUMBER_OF_DYNAMIC_PART = "number of dynamic part"
     NUMBER_OF_WRITTEN_FILES = "number of written files"

@@ -66,11 +66,11 @@ def jobs(self) -> list[SparkListenerJob]:
         return result

     def on_execution_start(self, event):
-        # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58
+        # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58
         self.status = SparkListenerExecutionStatus.STARTED

     def on_execution_end(self, event):
-        # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83
+        # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83
         for job in self._jobs.values():
             if job.status == SparkListenerJobStatus.FAILED:
                 self.status = SparkListenerExecutionStatus.FAILED
diff --git a/onetl/_metrics/listener/job.py b/onetl/_metrics/listener/job.py
index 5581d76e..488282ed 100644
--- a/onetl/_metrics/listener/job.py
+++ b/onetl/_metrics/listener/job.py
@@ -38,8 +38,8 @@ def stages(self) -> list[SparkListenerStage]:

     @classmethod
     def create(cls, event):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html
         result = cls(
             id=event.jobId(),
             description=event.properties().get("spark.job.description"),
diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py
index 04fe53c2..22522432 100644
--- a/onetl/_metrics/listener/listener.py
+++ b/onetl/_metrics/listener/listener.py
@@ -75,7 +75,7 @@ def onExecutionEnd(self, event):

         # Get execution metrics from SQLAppStatusStore,
         # as SparkListenerSQLExecutionEnd event does not provide them:
-        # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala
+        # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala
         session_status_store = self.spark._jsparkSession.sharedState().statusStore()  # noqa: WPS437
         raw_execution = session_status_store.execution(execution.id).get()
         metrics = raw_execution.metrics()
diff --git a/onetl/_metrics/listener/stage.py b/onetl/_metrics/listener/stage.py
index b858e151..3e233570 100644
--- a/onetl/_metrics/listener/stage.py
+++ b/onetl/_metrics/listener/stage.py
@@ -21,7 +21,7 @@ def __str__(self):

 @dataclass
 class SparkListenerStage:
-    # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/StageInfo.html
+    # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/StageInfo.html
     id: int
     status: SparkListenerStageStatus = SparkListenerStageStatus.PENDING
     metrics: SparkListenerTaskMetrics = field(default_factory=SparkListenerTaskMetrics, repr=False, init=False)
@@ -39,11 +39,11 @@ def create(cls, stage_info):
         return cls(id=stage_info.stageId())

     def on_stage_start(self, event):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html
         self.status = SparkListenerStageStatus.ACTIVE

     def on_stage_end(self, event):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html
         stage_info = event.stageInfo()
         if stage_info.failureReason().isDefined():
             self.status = SparkListenerStageStatus.FAILED
diff --git a/onetl/_metrics/listener/task.py b/onetl/_metrics/listener/task.py
index 5a17ffc5..aecc45fe 100644
--- a/onetl/_metrics/listener/task.py
+++ b/onetl/_metrics/listener/task.py
@@ -81,14 +81,14 @@ class SparkListenerTask:

     @classmethod
     def create(cls, task_info):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/TaskInfo.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/TaskInfo.html
         return cls(id=task_info.taskId())

     def on_task_start(self, event):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html
         self.status = SparkListenerTaskStatus(event.taskInfo().status())

     def on_task_end(self, event):
-        # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html
+        # https://spark.apache.org/docs/3.5.3/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html
         self.status = SparkListenerTaskStatus(event.taskInfo().status())
         self.metrics = SparkListenerTaskMetrics.create(event.taskMetrics())
diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py
index ab2090b0..9fa592db 100644
--- a/onetl/_util/spark.py
+++ b/onetl/_util/spark.py
@@ -143,7 +143,7 @@ def estimate_dataframe_size(spark_session: SparkSession, df: DataFrame) -> int:
     """
     Estimate in-memory DataFrame size in bytes. If cannot be estimated, return 0.

-    Using Spark's `SizeEstimator `_.
+    Using Spark's `SizeEstimator `_.
     """
     try:
         size_estimator = spark_session._jvm.org.apache.spark.util.SizeEstimator  # type: ignore[union-attr]
diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py
index 93f9d821..71fa82bd 100644
--- a/onetl/connection/db_connection/kafka/connection.py
+++ b/onetl/connection/db_connection/kafka/connection.py
@@ -332,7 +332,7 @@ def write_df_to_target(
         write_options.update(options.dict(by_alias=True, exclude_none=True, exclude={"if_exists"}))
         write_options["topic"] = target

-        # As of Apache Spark version 3.5.2, the mode 'error' is not functioning as expected.
+        # As of Apache Spark version 3.5.3, the mode 'error' is not functioning as expected.
         # This issue has been reported and can be tracked at:
         # https://issues.apache.org/jira/browse/SPARK-44774
         mode = options.if_exists
diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py
index 182955cd..2044ebb6 100644
--- a/onetl/connection/file_df_connection/spark_s3/connection.py
+++ b/onetl/connection/file_df_connection/spark_s3/connection.py
@@ -133,7 +133,7 @@ class SparkS3(SparkFileDFConnection):
         from pyspark.sql import SparkSession

         # Create Spark session with Hadoop AWS libraries loaded
-        maven_packages = SparkS3.get_packages(spark_version="3.5.2")
+        maven_packages = SparkS3.get_packages(spark_version="3.5.3")
         # Some dependencies are not used, but downloading takes a lot of time. Skipping them.
         excluded_packages = [
             "com.google.cloud.bigdataoss:gcs-connector",
@@ -236,8 +236,8 @@ def get_packages(

             from onetl.connection import SparkS3

-            SparkS3.get_packages(spark_version="3.5.2")
-            SparkS3.get_packages(spark_version="3.5.2", scala_version="2.12")
+            SparkS3.get_packages(spark_version="3.5.3")
+            SparkS3.get_packages(spark_version="3.5.3", scala_version="2.12")

         """
diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py
index 1f6e2e0e..426eb30f 100644
--- a/onetl/file/format/avro.py
+++ b/onetl/file/format/avro.py
@@ -88,7 +88,7 @@ class Avro(ReadWriteFileFormat):
         from pyspark.sql import SparkSession

         # Create Spark session with Avro package loaded
-        maven_packages = Avro.get_packages(spark_version="3.5.2")
+        maven_packages = Avro.get_packages(spark_version="3.5.3")
         spark = (
             SparkSession.builder.appName("spark-app-name")
             .config("spark.jars.packages", ",".join(maven_packages))
diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py
index 2e1ad003..508ec829 100644
--- a/onetl/file/format/xml.py
+++ b/onetl/file/format/xml.py
@@ -119,7 +119,7 @@ class XML(ReadWriteFileFormat):
         from pyspark.sql import SparkSession

         # Create Spark session with XML package loaded
-        maven_packages = XML.get_packages(spark_version="3.5.2")
+        maven_packages = XML.get_packages(spark_version="3.5.3")
         spark = (
             SparkSession.builder.appName("spark-app-name")
             .config("spark.jars.packages", ",".join(maven_packages))
@@ -184,10 +184,10 @@ def get_packages(  # noqa: WPS231

             from onetl.file.format import XML

-            XML.get_packages(spark_version="3.5.2")
-            XML.get_packages(spark_version="3.5.2", scala_version="2.12")
+            XML.get_packages(spark_version="3.5.3")
+            XML.get_packages(spark_version="3.5.3", scala_version="2.12")
             XML.get_packages(
-                spark_version="3.5.2",
+                spark_version="3.5.3",
                 scala_version="2.12",
                 package_version="0.18.0",
             )
diff --git a/requirements/tests/spark-3.5.2.txt b/requirements/tests/spark-3.5.3.txt
similarity index 76%
rename from requirements/tests/spark-3.5.2.txt
rename to requirements/tests/spark-3.5.3.txt
index 214f0d63..c5c1408a 100644
--- a/requirements/tests/spark-3.5.2.txt
+++ b/requirements/tests/spark-3.5.3.txt
@@ -1,5 +1,5 @@
 numpy>=1.16
 pandas>=1.0
 pyarrow>=1.0
-pyspark==3.5.2
+pyspark==3.5.3
 sqlalchemy
diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
index 53c7a67a..37862cf7 100644
--- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
+++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py
@@ -29,14 +29,14 @@ def test_avro_get_packages_scala_version_not_supported():
     [
         # Detect Scala version by Spark version
         ("2.4.0", None, "org.apache.spark:spark-avro_2.11:2.4.0"),
-        ("3.5.2", None, "org.apache.spark:spark-avro_2.12:3.5.2"),
+        ("3.5.3", None, "org.apache.spark:spark-avro_2.12:3.5.3"),
         # Override Scala version
         ("2.4.0", "2.11", "org.apache.spark:spark-avro_2.11:2.4.0"),
         ("2.4.0", "2.12", "org.apache.spark:spark-avro_2.12:2.4.0"),
-        ("3.5.2", "2.12", "org.apache.spark:spark-avro_2.12:3.5.2"),
-        ("3.5.2", "2.13", "org.apache.spark:spark-avro_2.13:3.5.2"),
+        ("3.5.3", "2.12", "org.apache.spark:spark-avro_2.12:3.5.3"),
+        ("3.5.3", "2.13", "org.apache.spark:spark-avro_2.13:3.5.3"),
         # Scala version contain three digits when only two needed
-        ("3.5.2", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.2"),
+        ("3.5.3", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.3"),
     ],
 )
 def test_avro_get_packages(spark_version, scala_version, package):
diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
index ecacb2ca..9f949651 100644
--- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
+++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py
@@ -35,17 +35,17 @@ def test_excel_get_packages_package_version_not_supported():
     [
         # Detect Scala version by Spark version
         ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]),
-        ("3.5.2", None, None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
+        ("3.5.3", None, None, ["com.crealytics:spark-excel_2.12:3.5.3_0.20.4"]),
         # Override Scala version
         ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]),
         ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.20.4"]),
-        ("3.5.2", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
-        ("3.5.2", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.2_0.20.4"]),
+        ("3.5.3", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.3_0.20.4"]),
+        ("3.5.3", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.3_0.20.4"]),
         # Override package version
         ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]),
-        ("3.5.2", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.2_0.18.0"]),
+        ("3.5.3", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.3_0.18.0"]),
         # Scala version contain three digits when only two needed
-        ("3.5.2", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]),
+        ("3.5.3", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.3_0.20.4"]),
     ],
 )
 def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages):
diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
index 9a5e6fac..59254361 100644
--- a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
+++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py
@@ -10,9 +10,9 @@
 @pytest.mark.parametrize(
     "spark_version, scala_version, package",
     [
-        ("3.5.2", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"),
-        ("3.5.2", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"),
-        ("3.5.2", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.2"),
+        ("3.5.3", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.3"),
+        ("3.5.3", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.3"),
+        ("3.5.3", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.3"),
     ],
 )
 def test_spark_s3_get_packages(spark_version, scala_version, package):