fix(spark-k8s): refactor for Spark Connect #1034

Merged · 16 commits · Mar 28, 2025
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -4,10 +4,19 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added

- spark-connect-client: A new image for Spark Connect tests and demos ([#1034])

### Changed

- spark-k8s: Include Spark Connect jars, replace OpenJDK with the Temurin JDK, and clean up the Dockerfile ([#1034])

### Fixed

- spark-k8s: Reduce Docker image size by removing the recursive chown/chmod calls in the final image ([#1042]).

[#1034]: https://github.com/stackabletech/docker-images/pull/1034
[#1042]: https://github.com/stackabletech/docker-images/pull/1042

## [25.3.0] - 2025-03-21
2 changes: 2 additions & 0 deletions conf.py
@@ -36,6 +36,7 @@
zookeeper = importlib.import_module("zookeeper.versions")
tools = importlib.import_module("tools.versions")
statsd_exporter = importlib.import_module("statsd_exporter.versions")
spark_connect_client = importlib.import_module("spark-connect-client.versions")

products = [
{"name": "airflow", "versions": airflow.versions},
@@ -64,6 +65,7 @@
{"name": "zookeeper", "versions": zookeeper.versions},
{"name": "tools", "versions": tools.versions},
{"name": "statsd_exporter", "versions": statsd_exporter.versions},
{"name": "spark-connect-client", "versions": spark_connect_client.versions},
]

open_shift_projects = {
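A note on the importlib pattern above: the directory name `spark-connect-client` contains hyphens, so it cannot be loaded with a plain `import` statement. `importlib.import_module` accepts the dotted module path as a string and resolves the package directory regardless. A minimal sketch of the pattern (the `versions` attribute mirrors the repo's per-product `versions.py` files):

import importlib

# "import spark-connect-client.versions" would be a SyntaxError, because
# hyphens are not valid in Python identifiers.
spark_connect_client = importlib.import_module("spark-connect-client.versions")

# The loaded module exposes the same "versions" list as every other product module.
for v in spark_connect_client.versions:
    print(v["product"])
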
59 changes: 59 additions & 0 deletions spark-connect-client/Dockerfile
@@ -0,0 +1,59 @@
# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5

# spark-builder: provides client libs for spark-connect
FROM stackable/image/spark-k8s AS spark-builder

FROM stackable/image/java-base

ARG PRODUCT
ARG PYTHON
ARG RELEASE
ARG STACKABLE_USER_UID

LABEL name="Stackable Spark Connect Examples" \
maintainer="[email protected]" \
vendor="Stackable GmbH" \
version="${PRODUCT}" \
release="${RELEASE}" \
summary="Spark Connect Examples" \
description="Spark Connect client libraries for Python and the JVM, including some examples."


ENV HOME=/stackable

COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect

RUN <<EOF
microdnf update
# python{version}-setuptools: needed to build the pyspark[connect] package
microdnf install --nodocs \
"python${PYTHON}" \
"python${PYTHON}-pip" \
"python${PYTHON}-setuptools"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Install Python libraries for the Spark Connect client
# shellcheck disable=SC2102
pip install --no-cache-dir pyspark[connect]==${PRODUCT}

# All files and folders owned by root group to support running as arbitrary users.
# This is best practice as all container users will belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above.
# If you do any file-based actions (copying / creating etc.) below this comment, you
# absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

USER ${STACKABLE_USER_UID}

WORKDIR /stackable/spark-connect-examples/python
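
As a hypothetical sanity check (not part of the PR), the Connect client stack baked into this image could be verified like this; importing `pyspark.sql.connect` only succeeds when the grpc extras pulled in by `pyspark[connect]` are present:

import pyspark

# This import fails if the grpcio/protobuf extras are missing.
from pyspark.sql.connect.session import SparkSession

print(pyspark.__version__)  # expected to match ${PRODUCT}, e.g. 3.5.5

The Python example that follows is copied into the image under /stackable/spark-connect-examples/python.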
@@ -0,0 +1,24 @@
import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
remote: str = sys.argv[1]
spark = (
SparkSession.builder.appName("SimpleSparkConnectApp")
.remote(remote)
.getOrCreate()
)

# See https://issues.apache.org/jira/browse/SPARK-46032
spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")

logFile = "/stackable/spark/README.md"
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains("a")).count()
numBs = logData.filter(logData.value.contains("b")).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

spark.stop()
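
For reference, the `remote` argument above is a Spark Connect URL using the `sc://` scheme. A minimal standalone sketch of the same connection, with an assumed in-cluster service name (15002 is the default Spark Connect port):

from pyspark.sql import SparkSession

# The host name is a placeholder; any reachable Spark Connect endpoint works.
spark = SparkSession.builder.remote("sc://spark-connect-server:15002").getOrCreate()
spark.range(5).show()  # executed on the server, not in this client process
spark.stop()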
8 changes: 8 additions & 0 deletions spark-connect-client/versions.py
@@ -0,0 +1,8 @@
versions = [
{
"product": "3.5.5",
"spark-k8s": "3.5.5",
"java-base": "17",
"python": "3.11",
},
]
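
Since the image installs `pyspark[connect]==${PRODUCT}` while the Connect jars come from the spark-k8s builder stage, the "product" and "spark-k8s" entries must stay in lockstep. A hypothetical consistency check, not part of the PR:

import importlib

versions = importlib.import_module("spark-connect-client.versions").versions
for v in versions:
    # The pyspark[connect] pin must match the Spark version of the builder image.
    assert v["product"] == v["spark-k8s"], f"{v['product']} != {v['spark-k8s']}"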
47 changes: 34 additions & 13 deletions spark-k8s/Dockerfile
@@ -235,15 +235,27 @@
COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
./

WORKDIR /stackable/spark-${PRODUCT}/dist/connect

# As of version 3.5.5, spark-connect jars are not included in the dist folder.
# To avoid classpath conflicts with existing spark applications,
# we create a new dist/connect folder, and copy them here.
RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
&& cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
&& cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .

COPY spark-k8s/stackable/jmx /stackable/jmx

WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars

RUN <<EOF
# Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar

# Get the correct `tini` binary for our architecture.
curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
@@ -255,14 +267,13 @@
curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
-o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar


chmod -R g=u /stackable/spark-${PRODUCT}/dist
chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
chmod -R g=u /stackable/jmx
EOF

# TODO: java-base installs the Adoptium dnf repo and the Temurin JRE, which is not needed here.
# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
FROM stackable/image/java-base AS final

ARG PRODUCT
@@ -282,7 +293,9 @@ LABEL name="Apache Spark" \

ENV HOME=/stackable
ENV SPARK_HOME=/stackable/spark
# Override the java-base version of JAVA_HOME to point to the jdk.
ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
ENV PYSPARK_PYTHON=/usr/bin/python
ENV PYTHONPATH=$SPARK_HOME/python

@@ -297,24 +310,32 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses

RUN <<EOF
microdnf update

# procps:
# Required for spark startup scripts.
# temurin-{version}-jdk:
# Needed by the Spark UI to display process information using "jps" and "jmap".
# Spark-Connect needs "javac" to compile auto-generated classes on the fly.
microdnf install --nodocs \
gzip \
hostname \
procps \
"python${PYTHON}" \
"python${PYTHON}-pip" \
zip \
"java-${JAVA_VERSION}-openjdk-devel"
"temurin-${JAVA_VERSION}-jdk"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Symlink example jar, so that we can easily use it in tests
ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
EOF


# ----------------------------------------
# Attention:
# If you do any file based actions (copying / creating etc.) below this comment you
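
The version-independent spark-examples.jar symlink created above exists so that tests don't have to track the Spark version. A hypothetical smoke test using it (paths follow SPARK_HOME=/stackable/spark; SparkPi is a stock Spark example class):

import subprocess

# Run SparkPi through the stable symlink instead of spark-examples_2.12-3.5.5.jar.
subprocess.run(
    [
        "/stackable/spark/bin/spark-submit",
        "--class", "org.apache.spark.examples.SparkPi",
        "--master", "local[2]",
        "/stackable/spark/examples/jars/spark-examples.jar",
        "100",
    ],
    check=True,
)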