From d2921d68a7e86dcf39e4c4b169aae7bc11d64796 Mon Sep 17 00:00:00 2001
From: Thanh Nguyen
Date: Mon, 10 Jun 2024 16:34:58 -0500
Subject: [PATCH] use hadoop image as base image for spark

---
 .github/workflows/image_build_push.yaml |  3 ++-
 hadoop/base/Dockerfile                  |  1 +
 run_config.py                           |  1 +
 spark/base/Dockerfile                   | 35 ++++---------------------
 4 files changed, 9 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/image_build_push.yaml b/.github/workflows/image_build_push.yaml
index 78021f1..39b0ea7 100644
--- a/.github/workflows/image_build_push.yaml
+++ b/.github/workflows/image_build_push.yaml
@@ -20,6 +20,7 @@ jobs:
   build-spark-base:
     name: Build Spark base image
     uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
+    needs: [build-hadoop-base]
     with:
       OVERRIDE_REPO_NAME: spark-base
       OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
@@ -111,7 +112,7 @@ jobs:
     needs: [build-spark-base]
     with:
       OVERRIDE_REPO_NAME: spark-master
-      OVERRIDE_TAG_NAME: 3.3.0-hadoop3.3
+      OVERRIDE_TAG_NAME: v3.3.0-hadoop3.3
       DOCKERFILE_LOCATION: "./spark/master/Dockerfile"
       DOCKERFILE_BUILD_CONTEXT: "./spark/master"
       USE_QUAY_ONLY: true
diff --git a/hadoop/base/Dockerfile b/hadoop/base/Dockerfile
index 94ffcfa..d512d29 100644
--- a/hadoop/base/Dockerfile
+++ b/hadoop/base/Dockerfile
@@ -35,6 +35,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*
 
 RUN wget ${HADOOP_INSTALLATION_URL} \
+    && ln -s /lib64/ld-linux-x86-64.so.2 /lib/ld-linux-x86-64.so.2 \
     && mkdir -p $HADOOP_HOME \
     && tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
     && rm hadoop-${HADOOP_VERSION}.tar.gz \
diff --git a/run_config.py b/run_config.py
index 2eb94a4..8d3af3e 100644
--- a/run_config.py
+++ b/run_config.py
@@ -27,6 +27,7 @@ def configure_core_site():
     root = tree.getroot()
     root.append(create_property('hadoop.tmp.dir', '{}/hdfs/tmp'.format(config.HADOOP_HOME)))
     root.append(create_property('fs.default.name', config.HADOOP_URL))
+    root.append(create_property('fs.defaultFS', config.HADOOP_URL))
     indent(root)
     tree.write(core_site_path)
 
diff --git a/spark/base/Dockerfile b/spark/base/Dockerfile
index 0414560..d910001 100644
--- a/spark/base/Dockerfile
+++ b/spark/base/Dockerfile
@@ -1,50 +1,25 @@
 # To check running container: docker exec -it tube /bin/bash
-FROM quay.io/cdis/python:python3.9-buster-stable
+FROM quay.io/cdis/hadoop-base:v3.3.0
 
 ENV DEBIAN_FRONTEND=noninteractive \
-    HADOOP_VERSION="3" \
+    HADOOP_SPARK_VERSION="3" \
     ES_HADOOP_VERSION="8.3.3" \
     SPARK_VERSION="3.3.0"
 
-ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
-    HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
+ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz" \
     SPARK_HOME="/spark" \
     HADOOP_HOME="/hadoop" \
     JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"
 
 RUN mkdir -p /usr/share/man/man1
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    software-properties-common \
-    libpq-dev \
-    build-essential \
-    libssl1.1 \
-    libgnutls30 \
-    ca-certificates-java \
-    openjdk-11-jdk \
-    openssh-server \
-    # dependency for pyscopg2 - which is dependency for sqlalchemy postgres engine
-    libpq-dev \
-    wget \
-    git \
-    # dependency for cryptography
-    libffi-dev \
-    # dependency for cryptography
-    libssl-dev \
-    vim \
-    dnsutils \
-    && rm -rf /var/lib/apt/lists/*
-
 RUN wget $SPARK_INSTALLATION_URL \
     && mkdir -p $SPARK_HOME \
-    && tar -xvf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C $SPARK_HOME --strip-components 1 \
-    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+    && tar -xvf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz -C $SPARK_HOME --strip-components 1 \
+    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_SPARK_VERSION}.tgz
 
 RUN apt-get --only-upgrade install libpq-dev
 
-ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"
-
-
 RUN mkdir -p /var/run/sshd ${HADOOP_HOME}/hdfs ${HADOOP_HOME}/hdfs/data ${HADOOP_HOME}/hdfs/data/dfs ${HADOOP_HOME}/hdfs/data/dfs/namenode ${HADOOP_HOME}/logs
 
 ENV PYTHONHASHSEED 1