fix(spark-k8s): refactor for Spark Connect #1034

Merged · 16 commits · Mar 28, 2025
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -4,10 +4,19 @@ All notable changes to this project will be documented in this file.

## [Unreleased]

### Added

- spark-connect-client: A new image for Spark Connect tests and demos ([#1034])

### Changed

- spark-k8s: Include Spark Connect jars, replace OpenJDK with the Temurin JDK, and clean up the Dockerfile ([#1034])

### Fixed

- spark-k8s: Reduce Docker image size by removing the recursive chown/chmod calls in the final image ([#1042]).

[#1034]: https://github.com/stackabletech/docker-images/pull/1034
[#1042]: https://github.com/stackabletech/docker-images/pull/1042

## [25.3.0] - 2025-03-21
2 changes: 2 additions & 0 deletions conf.py
@@ -36,6 +36,7 @@
zookeeper = importlib.import_module("zookeeper.versions")
tools = importlib.import_module("tools.versions")
statsd_exporter = importlib.import_module("statsd_exporter.versions")
spark_connect_client = importlib.import_module("spark-connect-client.versions")

products = [
{"name": "airflow", "versions": airflow.versions},
@@ -64,6 +65,7 @@
{"name": "zookeeper", "versions": zookeeper.versions},
{"name": "tools", "versions": tools.versions},
{"name": "statsd_exporter", "versions": statsd_exporter.versions},
{"name": "spark-connect-client", "versions": spark_connect_client.versions},
]

open_shift_projects = {
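A note on the importlib pattern above: the directory name `spark-connect-client` contains hyphens, so it cannot be loaded with a plain `import` statement. `importlib.import_module` accepts the dotted module path as a string and resolves the package directory regardless. A minimal sketch of the pattern (the `versions` attribute mirrors the repo's per-product `versions.py` files):

import importlib

# "import spark-connect-client.versions" would be a SyntaxError, because
# hyphens are not valid in Python identifiers.
spark_connect_client = importlib.import_module("spark-connect-client.versions")

# The loaded module exposes the same "versions" list as every other product module.
for v in spark_connect_client.versions:
    print(v["product"])
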
59 changes: 59 additions & 0 deletions spark-connect-client/Dockerfile
@@ -0,0 +1,59 @@
# syntax=docker/dockerfile:1.10.0@sha256:865e5dd094beca432e8c0a1d5e1c465db5f998dca4e439981029b3b81fb39ed5

# spark-builder: provides client libs for spark-connect
FROM stackable/image/spark-k8s AS spark-builder

FROM stackable/image/java-base

ARG PRODUCT
ARG PYTHON
ARG RELEASE
ARG STACKABLE_USER_UID

LABEL name="Stackable Spark Connect Examples" \
maintainer="[email protected]" \
vendor="Stackable GmbH" \
version="${PRODUCT}" \
release="${RELEASE}" \
summary="Spark Connect Examples" \
description="Spark Connect client libraries for Python and the JVM, including some examples."


ENV HOME=/stackable

COPY spark-connect-client/stackable/spark-connect-examples /stackable/spark-connect-examples
COPY --chown=${STACKABLE_USER_UID}:0 --from=spark-builder /stackable/spark/connect /stackable/spark/connect

RUN <<EOF
microdnf update
# python{version}-setuptools: needed to build the pyspark[connect] package
microdnf install --nodocs \
"python${PYTHON}" \
"python${PYTHON}-pip" \
"python${PYTHON}-setuptools"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Install Python libraries for the Spark Connect client
# shellcheck disable=SC2102
pip install --no-cache-dir pyspark[connect]==${PRODUCT}

# All files and folders owned by root group to support running as arbitrary users.
# This is best practice as all container users will belong to the root group (0).
chown -R ${STACKABLE_USER_UID}:0 /stackable
chmod -R g=u /stackable
EOF

# ----------------------------------------
# Attention: We are changing the group of all files in /stackable directly above.
# If you do any file-based actions (copying / creating etc.) below this comment, you
# absolutely need to make sure that the correct permissions are applied!
# chown ${STACKABLE_USER_UID}:0
# ----------------------------------------

USER ${STACKABLE_USER_UID}

WORKDIR /stackable/spark-connect-examples/python
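
As a hypothetical sanity check (not part of the PR), the Connect client stack baked into this image could be verified like this; importing `pyspark.sql.connect` only succeeds when the grpc extras pulled in by `pyspark[connect]` are present:

import pyspark

# This import fails if the grpcio/protobuf extras are missing.
from pyspark.sql.connect.session import SparkSession

print(pyspark.__version__)  # expected to match ${PRODUCT}, e.g. 3.5.5

The Python example that follows is copied into the image under /stackable/spark-connect-examples/python.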
@@ -0,0 +1,24 @@
import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
remote: str = sys.argv[1]
spark = (
SparkSession.builder.appName("SimpleSparkConnectApp")
.remote(remote)
.getOrCreate()
)

# See https://issues.apache.org/jira/browse/SPARK-46032
spark.addArtifacts("/stackable/spark/connect/spark-connect_2.12-3.5.5.jar")

logFile = "/stackable/spark/README.md"
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains("a")).count()
numBs = logData.filter(logData.value.contains("b")).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

spark.stop()
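
For reference, the `remote` argument above is a Spark Connect URL using the `sc://` scheme. A minimal standalone sketch of the same connection, with an assumed in-cluster service name (15002 is the default Spark Connect port):

from pyspark.sql import SparkSession

# The host name is a placeholder; any reachable Spark Connect endpoint works.
spark = SparkSession.builder.remote("sc://spark-connect-server:15002").getOrCreate()
spark.range(5).show()  # executed on the server, not in this client process
spark.stop()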
8 changes: 8 additions & 0 deletions spark-connect-client/versions.py
@@ -0,0 +1,8 @@
versions = [
{
"product": "3.5.5",
"spark-k8s": "3.5.5",
"java-base": "17",
"python": "3.11",
},
]
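
Since the image installs `pyspark[connect]==${PRODUCT}` while the Connect jars come from the spark-k8s builder stage, the "product" and "spark-k8s" entries must stay in lockstep. A hypothetical consistency check, not part of the PR:

import importlib

versions = importlib.import_module("spark-connect-client.versions").versions
for v in versions:
    # The pyspark[connect] pin must match the Spark version of the builder image.
    assert v["product"] == v["spark-k8s"], f"{v['product']} != {v['spark-k8s']}"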
47 changes: 34 additions & 13 deletions spark-k8s/Dockerfile
@@ -235,15 +235,27 @@
COPY --from=hbase-builder --chown=${STACKABLE_USER_UID}:0 \
/stackable/hbase/lib/client-facing-thirdparty/opentelemetry-semconv-*-alpha.jar \
./

WORKDIR /stackable/spark-${PRODUCT}/dist/connect

# As of version 3.5.5, spark-connect jars are not included in the dist folder.
# To avoid classpath conflicts with existing spark applications,
# we create a new dist/connect folder, and copy them here.
RUN cp /stackable/spark-${PRODUCT}/connector/connect/server/target/spark-connect_*-${PRODUCT}.jar . \
&& cp /stackable/spark-${PRODUCT}/connector/connect/common/target/spark-connect-common_*-${PRODUCT}.jar . \
&& cp /stackable/spark-${PRODUCT}/connector/connect/client/jvm/target/spark-connect-client-jvm_2.12-${PRODUCT}.jar .

COPY spark-k8s/stackable/jmx /stackable/jmx

WORKDIR /stackable/spark-${PRODUCT}/dist/extra-jars

RUN <<EOF
# Download jackson-dataformat-xml, stax2-api, and woodstox-core which are required for logging.
curl --fail https://repo.stackable.tech/repository/packages/jackson-dataformat-xml/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/jackson-dataformat-xml-${JACKSON_DATAFORMAT_XML}.jar
curl --fail https://repo.stackable.tech/repository/packages/stax2-api/stax2-api-${STAX2_API}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/stax2-api-${STAX2_API}.jar
curl --fail https://repo.stackable.tech/repository/packages/woodstox-core/woodstox-core-${WOODSTOX_CORE}.jar \
-o /stackable/spark-${PRODUCT}/dist/extra-jars/woodstox-core-${WOODSTOX_CORE}.jar

# Get the correct `tini` binary for our architecture.
curl --fail "https://repo.stackable.tech/repository/packages/tini/tini-${TINI}-${TARGETARCH}" \
@@ -255,14 +267,13 @@
curl --fail "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" \
-o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar


chmod -R g=u /stackable/spark-${PRODUCT}/dist
chmod -R g=u /stackable/spark-${PRODUCT}/assembly/target/bom.json
chmod -R g=u /stackable/jmx
EOF

# TODO: java-base installs the Adoptium dnf repo and the Temurin JRE, which is not needed here.
# To reduce the size of this image, the Adoptium repo could be moved to stackable-base instead.
FROM stackable/image/java-base AS final

ARG PRODUCT
@@ -282,7 +293,9 @@ LABEL name="Apache Spark" \

ENV HOME=/stackable
ENV SPARK_HOME=/stackable/spark
# Override the java-base version of JAVA_HOME to point to the jdk.
ENV JAVA_HOME="/usr/lib/jvm/temurin-${JAVA_VERSION}-jdk"
ENV PATH=$SPARK_HOME/bin:$JAVA_HOME/bin:$PATH
ENV PYSPARK_PYTHON=/usr/bin/python
ENV PYTHONPATH=$SPARK_HOME/python

@@ -297,24 +310,32 @@ COPY --chown=${STACKABLE_USER_UID}:0 spark-k8s/licenses /licenses

RUN <<EOF
microdnf update

# procps:
# Required for spark startup scripts.
# temurin-{version}-jdk:
# Needed by the Spark UI to display process information using "jps" and "jmap".
# Spark-Connect needs "javac" to compile auto-generated classes on the fly.
microdnf install --nodocs \
gzip \
hostname \
procps \
"python${PYTHON}" \
"python${PYTHON}-pip" \
zip \
"java-${JAVA_VERSION}-openjdk-devel"
"temurin-${JAVA_VERSION}-jdk"
microdnf clean all
rm -rf /var/cache/yum

ln -s /usr/bin/python${PYTHON} /usr/bin/python
ln -s /usr/bin/pip-${PYTHON} /usr/bin/pip

# Symlink example jar, so that we can easily use it in tests
ln -s /stackable/spark/examples/jars/spark-examples_*.jar /stackable/spark/examples/jars/spark-examples.jar
chown -h ${STACKABLE_USER_UID}:0 /stackable/spark/examples/jars/spark-examples.jar
EOF


# ----------------------------------------
# Attention:
# If you do any file based actions (copying / creating etc.) below this comment you
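
The version-independent spark-examples.jar symlink created above exists so that tests don't have to track the Spark version. A hypothetical smoke test using it (paths follow SPARK_HOME=/stackable/spark; SparkPi is a stock Spark example class):

import subprocess

# Run SparkPi through the stable symlink instead of spark-examples_2.12-3.5.5.jar.
subprocess.run(
    [
        "/stackable/spark/bin/spark-submit",
        "--class", "org.apache.spark.examples.SparkPi",
        "--master", "local[2]",
        "/stackable/spark/examples/jars/spark-examples.jar",
        "100",
    ],
    check=True,
)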