Skip to content

Commit

Permalink
use hadoop image as base image for spark
Browse files Browse the repository at this point in the history
  • Loading branch information
thanh-nguyen-dang committed Jun 10, 2024
1 parent 3d99b5e commit d2921d6
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 31 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/image_build_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ jobs:
build-spark-base:
name: Build Spark base image
uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
needs: [build-hadoop-base]
with:
OVERRIDE_REPO_NAME: spark-base
OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
Expand Down Expand Up @@ -111,7 +112,7 @@ jobs:
needs: [build-spark-base]
with:
OVERRIDE_REPO_NAME: spark-master
OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
OVERRIDE_TAG_NAME: v3.3.0-hadoop3.3
DOCKERFILE_LOCATION: "./spark/master/Dockerfile"
DOCKERFILE_BUILD_CONTEXT: "./spark/master"
USE_QUAY_ONLY: true
Expand Down
1 change: 1 addition & 0 deletions hadoop/base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*

RUN wget ${HADOOP_INSTALLATION_URL} \
&& ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2 \
&& mkdir -p $HADOOP_HOME \
&& tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
&& rm hadoop-${HADOOP_VERSION}.tar.gz \
Expand Down
1 change: 1 addition & 0 deletions run_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def configure_core_site():
root = tree.getroot()
root.append(create_property('hadoop.tmp.dir', '{}/hdfs/tmp'.format(config.HADOOP_HOME)))
root.append(create_property('fs.default.name', config.HADOOP_URL))
root.append(create_property('fs.defaultFS', config.HADOOP_URL))
indent(root)
tree.write(core_site_path)

Expand Down
35 changes: 5 additions & 30 deletions spark/base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,50 +1,25 @@
# To check running container: docker exec -it tube /bin/bash
FROM quay.io/cdis/python:python3.9-buster-stable
FROM quay.io/cdis/hadoop-base:v3.3.0

ENV DEBIAN_FRONTEND=noninteractive \
HADOOP_VERSION="3" \
HADOOP_SPARK_VERSION="3" \
ES_HADOOP_VERSION="8.3.3" \
SPARK_VERSION="3.3.0"

ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz" \
SPARK_HOME="/spark" \
HADOOP_HOME="/hadoop" \
JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"

RUN mkdir -p /usr/share/man/man1

RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
libpq-dev \
build-essential \
libssl1.1 \
libgnutls30 \
ca-certificates-java \
openjdk-11-jdk \
openssh-server \
# dependency for psycopg2 - which is a dependency for the sqlalchemy postgres engine
libpq-dev \
wget \
git \
# dependency for cryptography
libffi-dev \
# dependency for cryptography
libssl-dev \
vim \
dnsutils \
&& rm -rf /var/lib/apt/lists/*

RUN wget $SPARK_INSTALLATION_URL \
&& mkdir -p $SPARK_HOME \
&& tar -xvf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C $SPARK_HOME --strip-components 1 \
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
&& tar -xvf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz -C $SPARK_HOME --strip-components 1 \
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz

RUN apt-get --only-upgrade install libpq-dev

ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"


RUN mkdir -p /var/run/sshd ${HADOOP_HOME}/hdfs ${HADOOP_HOME}/hdfs/data ${HADOOP_HOME}/hdfs/data/dfs ${HADOOP_HOME}/hdfs/data/dfs/namenode ${HADOOP_HOME}/logs

ENV PYTHONHASHSEED 1

0 comments on commit d2921d6

Please sign in to comment.