From ada99106c2c39b653a5b86793c3c61a145018174 Mon Sep 17 00:00:00 2001
From: Thanh Nguyen
Date: Thu, 30 May 2024 04:35:08 -0500
Subject: [PATCH] add Dockerfiles for spark

---
 .github/workflows/image_build_push.yaml |   4 +
 hadoop/base/Dockerfile                  | 101 +++++++++++++++++++++
 hadoop/base/entrypoint.sh               | 116 ++++++++++++++++++++++++
 hadoop/datanode/Dockerfile              |  15 +++
 hadoop/datanode/run.sh                  |   9 ++
 hadoop/historyserver/Dockerfile         |  15 +++
 hadoop/historyserver/run.sh             |   3 +
 hadoop/namenode/Dockerfile              |  15 +++
 hadoop/namenode/run.sh                  |  22 +++++
 hadoop/nodemanager/Dockerfile           |  11 ++
 hadoop/nodemanager/run.sh               |   3 +
 hadoop/resourcemanager/Dockerfile       |  11 ++
 hadoop/resourcemanager/run.sh           |   3 +
 spark/base/Dockerfile                   |  56 ++++++++++
 spark/master/Dockerfile                 |  15 +++
 spark/master/master.sh                  |   0
 spark/submit/Dockerfile                 |  11 ++
 spark/submit/submit.sh                  |   0
 spark/worker/Dockerfile                 |  13 +++
 spark/worker/worker.sh                  |   0
 20 files changed, 423 insertions(+)
 create mode 100644 hadoop/base/Dockerfile
 create mode 100644 hadoop/base/entrypoint.sh
 create mode 100644 hadoop/datanode/Dockerfile
 create mode 100644 hadoop/datanode/run.sh
 create mode 100644 hadoop/historyserver/Dockerfile
 create mode 100644 hadoop/historyserver/run.sh
 create mode 100644 hadoop/namenode/Dockerfile
 create mode 100644 hadoop/namenode/run.sh
 create mode 100644 hadoop/nodemanager/Dockerfile
 create mode 100644 hadoop/nodemanager/run.sh
 create mode 100644 hadoop/resourcemanager/Dockerfile
 create mode 100644 hadoop/resourcemanager/run.sh
 create mode 100644 spark/base/Dockerfile
 create mode 100644 spark/master/Dockerfile
 create mode 100644 spark/master/master.sh
 create mode 100644 spark/submit/Dockerfile
 create mode 100644 spark/submit/submit.sh
 create mode 100644 spark/worker/Dockerfile
 create mode 100644 spark/worker/worker.sh

diff --git a/.github/workflows/image_build_push.yaml b/.github/workflows/image_build_push.yaml
index 2221aee..fa563e3 100644
--- a/.github/workflows/image_build_push.yaml
+++ b/.github/workflows/image_build_push.yaml
@@ -6,6 +6,10 @@ jobs:
   ci:
     name: Build Image and Push to Quay
     uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
+    with:
+      OVERRIDE_REPO_NAME: hadoop-base
+      OVERRIDE_TAG_NAME: 3.3.0
+      DOCKERFILE_LOCATION: "./hadoop/base/Dockerfile"
     secrets:
       ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
       ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
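Note that only hadoop-base is wired into CI by this change; the service images below are not built by this workflow. For local testing, something like the following approximates the job (the build context and push mechanics are assumptions about the reusable workflow; the registry namespace is inferred from the FROM lines later in this patch):

```bash
# Rough local equivalent of the CI job above.
docker build -f ./hadoop/base/Dockerfile -t quay.io/cdis/hadoop-base:3.3.0 .
docker push quay.io/cdis/hadoop-base:3.3.0
```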
diff --git a/hadoop/base/Dockerfile b/hadoop/base/Dockerfile
new file mode 100644
index 0000000..6da0e11
--- /dev/null
+++ b/hadoop/base/Dockerfile
@@ -0,0 +1,101 @@
+# To open a shell in a running container: docker exec -it <container> /bin/bash
+FROM quay.io/cdis/python:python3.9-buster-stable
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    HADOOP_VERSION="3.3.2"
+
+ENV HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
+    HADOOP_HOME="/hadoop" \
+    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"
+
+RUN mkdir -p /usr/share/man/man1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    software-properties-common \
+    build-essential \
+    libssl1.1 \
+    libgnutls30 \
+    ca-certificates-java \
+    openjdk-11-jdk \
+    openssh-server \
+    # dependency for psycopg2, which sqlalchemy's postgres engine depends on
+    libpq-dev \
+    wget \
+    git \
+    # dependency for cryptography
+    libffi-dev \
+    # dependency for cryptography
+    libssl-dev \
+    vim \
+    net-tools \
+    netcat \
+    gnupg \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN wget ${HADOOP_INSTALLATION_URL} \
+    && mkdir -p $HADOOP_HOME \
+    && tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
+    && rm hadoop-${HADOOP_VERSION}.tar.gz \
+    && rm -rf $HADOOP_HOME/share/doc
+
+ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
+    HADOOP_MAPRED_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_HOME=$HADOOP_HOME \
+    HADOOP_HDFS_HOME=$HADOOP_HOME \
+    YARN_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
+
+RUN apt-get update && apt-get install -y --only-upgrade libpq-dev && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="${PATH}:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin"
+
+ENV CORE_CONF_fs_defaultFS=hdfs://namenode:9000 \
+    CORE_CONF_hadoop_http_staticuser_user=root \
+    CORE_CONF_hadoop_proxyuser_hue_hosts=* \
+    CORE_CONF_hadoop_proxyuser_hue_groups=* \
+    CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec \
+    HDFS_CONF_dfs_webhdfs_enabled=true \
+    HDFS_CONF_dfs_permissions_enabled=false \
+    HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false \
+    YARN_CONF_yarn_log___aggregation___enable=true \
+    YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ \
+    YARN_CONF_yarn_resourcemanager_recovery_enabled=true \
+    YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore \
+    YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler \
+    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 \
+    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 \
+    YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate \
+    YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true \
+    YARN_CONF_yarn_resourcemanager_hostname=resourcemanager \
+    YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 \
+    YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 \
+    YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031 \
+    YARN_CONF_yarn_timeline___service_enabled=true \
+    YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true \
+    YARN_CONF_yarn_timeline___service_hostname=historyserver \
+    YARN_CONF_mapreduce_map_output_compress=true \
+    YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec \
+    YARN_CONF_yarn_nodemanager_resource_memory___mb=16384 \
+    YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8 \
+    YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 \
+    YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs \
+    YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle \
+    MAPRED_CONF_mapreduce_framework_name=yarn \
+    MAPRED_CONF_mapred_child_java_opts=-Xmx4096m \
+    MAPRED_CONF_mapreduce_map_memory_mb=4096 \
+    MAPRED_CONF_mapreduce_reduce_memory_mb=8192 \
+    MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m \
+    MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m \
+    MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
+    MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
+    MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/
+
+COPY . /gen3spark
+WORKDIR /gen3spark
+
+# ENV TINI_VERSION v0.18.0
+# ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+# RUN chmod +x /tini
+# ENTRYPOINT ["/tini", "--"]
+
+CMD ["/usr/sbin/sshd", "-D"]
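Every `*_CONF_*` value above is an ordinary environment variable, so per-deployment overrides need no rebuild; entrypoint.sh (next file) is what translates them into XML. A minimal sketch, assuming entrypoint.sh is actually wired in as the ENTRYPOINT (the tini lines above are still commented out, so nothing invokes it yet):

```bash
# Repoint the container at a different namenode purely via the environment.
docker run --rm \
  -e CORE_CONF_fs_defaultFS=hdfs://my-namenode:9000 \
  -e YARN_CONF_yarn_resourcemanager_hostname=my-resourcemanager \
  quay.io/cdis/hadoop-base:3.3.0 hdfs getconf -confKey fs.defaultFS
```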
diff --git a/hadoop/base/entrypoint.sh b/hadoop/base/entrypoint.sh
new file mode 100644
index 0000000..c90e09f
--- /dev/null
+++ b/hadoop/base/entrypoint.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+# Set some sensible defaults
+export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}
+
+function addProperty() {
+    local path=$1
+    local name=$2
+    local value=$3
+
+    local entry="<property><name>$name</name><value>${value}</value></property>"
+    local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
+    sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
+}
+
+function configure() {
+    local path=$1
+    local module=$2
+    local envPrefix=$3
+
+    local var
+    local value
+
+    echo "Configuring $module"
+    for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
+        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
+        var="${envPrefix}_${c}"
+        value=${!var}
+        echo " - Setting $name=$value"
+        addProperty $path $name "$value"
+    done
+}
+
+configure $HADOOP_CONF_DIR/core-site.xml core CORE_CONF
+configure $HADOOP_CONF_DIR/hdfs-site.xml hdfs HDFS_CONF
+configure $HADOOP_CONF_DIR/yarn-site.xml yarn YARN_CONF
+configure $HADOOP_CONF_DIR/httpfs-site.xml httpfs HTTPFS_CONF
+configure $HADOOP_CONF_DIR/kms-site.xml kms KMS_CONF
+configure $HADOOP_CONF_DIR/mapred-site.xml mapred MAPRED_CONF
+
+if [ "$MULTIHOMED_NETWORK" = "1" ]; then
+    echo "Configuring for multihomed network"
+
+    # HDFS
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.client.use.datanode.hostname true
+    addProperty $HADOOP_CONF_DIR/hdfs-site.xml dfs.datanode.use.datanode.hostname true
+
+    # YARN
+    addProperty $HADOOP_CONF_DIR/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
+    addProperty $HADOOP_CONF_DIR/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0
+
+    # MAPRED
+    addProperty $HADOOP_CONF_DIR/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
+fi
+
+if [ -n "$GANGLIA_HOST" ]; then
+    mv $HADOOP_CONF_DIR/hadoop-metrics.properties $HADOOP_CONF_DIR/hadoop-metrics.properties.orig
+    mv $HADOOP_CONF_DIR/hadoop-metrics2.properties $HADOOP_CONF_DIR/hadoop-metrics2.properties.orig
+
+    for module in mapred jvm rpc ugi; do
+        echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
+        echo "$module.period=10"
+        echo "$module.servers=$GANGLIA_HOST:8649"
+    done > $HADOOP_CONF_DIR/hadoop-metrics.properties
+
+    for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
+        echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
+        echo "$module.sink.ganglia.period=10"
+        echo "$module.sink.ganglia.supportsparse=true"
+        echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
+        echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
+        echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
+    done > $HADOOP_CONF_DIR/hadoop-metrics2.properties
+fi
+
+function wait_for_it()
+{
+    local serviceport=$1
+    local service=${serviceport%%:*}
+    local port=${serviceport#*:}
+    local retry_seconds=5
+    local max_try=100
+    let i=1
+
+    nc -z $service $port
+    result=$?
+
+    until [ $result -eq 0 ]; do
+        echo "[$i/$max_try] check for ${service}:${port}..."
+        echo "[$i/$max_try] ${service}:${port} is not available yet"
+        if (( $i == $max_try )); then
+            echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
+            exit 1
+        fi
+
+        echo "[$i/$max_try] try in ${retry_seconds}s once again ..."
+        let "i++"
+        sleep $retry_seconds
+
+        nc -z $service $port
+        result=$?
+    done
+    echo "[$i/$max_try] $service:${port} is available."
+}
+
+for i in ${SERVICE_PRECONDITION[@]}
+do
+    wait_for_it ${i}
+done
+
+exec "$@"
\ No newline at end of file
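The `name=` substitution above is the whole naming contract between the ENV block and the XML config files: `___` becomes `-`, `__` becomes a literal `_` (via the `@` placeholder), and any remaining `_` becomes `.`. A standalone check of the mapping:

```bash
# YARN_CONF_yarn_log___aggregation___enable -> yarn.log-aggregation-enable
echo 'yarn_log___aggregation___enable' \
  | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'
```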
diff --git a/hadoop/datanode/Dockerfile b/hadoop/datanode/Dockerfile
new file mode 100644
index 0000000..cc4db8e
--- /dev/null
+++ b/hadoop/datanode/Dockerfile
@@ -0,0 +1,15 @@
+FROM quay.io/cdis/hadoop-base:3.3.0
+
+HEALTHCHECK CMD curl -f http://localhost:9864/ || exit 1
+
+ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
+RUN mkdir -p /hadoop/dfs/data
+VOLUME /hadoop/dfs/data
+
+ADD run.sh /gen3spark/run.sh
+RUN chmod a+x /gen3spark/run.sh
+WORKDIR /gen3spark
+
+EXPOSE 9864
+
+CMD ["/gen3spark/run.sh"]
diff --git a/hadoop/datanode/run.sh b/hadoop/datanode/run.sh
new file mode 100644
index 0000000..b7a3c99
--- /dev/null
+++ b/hadoop/datanode/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
+if [ ! -d $datadir ]; then
+  echo "Datanode data directory not found: $datadir"
+  exit 2
+fi
+
+$HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR datanode
\ No newline at end of file
diff --git a/hadoop/historyserver/Dockerfile b/hadoop/historyserver/Dockerfile
new file mode 100644
index 0000000..49d65e3
--- /dev/null
+++ b/hadoop/historyserver/Dockerfile
@@ -0,0 +1,15 @@
+FROM quay.io/cdis/hadoop-base:3.3.0
+
+HEALTHCHECK CMD curl -f http://localhost:8188/ || exit 1
+
+ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
+RUN mkdir -p /hadoop/yarn/timeline
+VOLUME /hadoop/yarn/timeline
+
+ADD run.sh /gen3spark/run.sh
+RUN chmod a+x /gen3spark/run.sh
+WORKDIR /gen3spark
+
+EXPOSE 8188
+
+CMD ["/gen3spark/run.sh"]
\ No newline at end of file
diff --git a/hadoop/historyserver/run.sh b/hadoop/historyserver/run.sh
new file mode 100644
index 0000000..0923bd5
--- /dev/null
+++ b/hadoop/historyserver/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR historyserver
\ No newline at end of file
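Since the service images inherit entrypoint.sh from hadoop-base, startup ordering can be expressed with SERVICE_PRECONDITION, which wait_for_it() polls via nc. A sketch, with the image name and docker network as assumptions (and, as noted above, assuming entrypoint.sh is wired in as the ENTRYPOINT):

```bash
# Hold the datanode until the namenode RPC port answers.
docker run -d --name datanode --network hadoop-net \
  -e SERVICE_PRECONDITION="namenode:9000" \
  quay.io/cdis/hadoop-datanode:3.3.0
```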
diff --git a/hadoop/namenode/Dockerfile b/hadoop/namenode/Dockerfile
new file mode 100644
index 0000000..49d65e3
--- /dev/null
+++ b/hadoop/namenode/Dockerfile
@@ -0,0 +1,15 @@
+FROM quay.io/cdis/hadoop-base:3.3.0
+
+HEALTHCHECK CMD curl -f http://localhost:9870/ || exit 1
+
+ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
+RUN mkdir -p /hadoop/dfs/name
+VOLUME /hadoop/dfs/name
+
+ADD run.sh /gen3spark/run.sh
+RUN chmod a+x /gen3spark/run.sh
+WORKDIR /gen3spark
+
+EXPOSE 9870
+
+CMD ["/gen3spark/run.sh"]
\ No newline at end of file
diff --git a/hadoop/namenode/run.sh b/hadoop/namenode/run.sh
new file mode 100644
index 0000000..0923bd5
--- /dev/null
+++ b/hadoop/namenode/run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
+if [ ! -d $namedir ]; then
+  echo "Namenode name directory not found: $namedir"
+  exit 2
+fi
+
+if [ -z "$CLUSTER_NAME" ]; then
+  echo "Cluster name not specified"
+  exit 2
+fi
+
+echo "remove lost+found from $namedir"
+rm -rf $namedir/lost+found
+
+if [ "`ls -A $namedir`" == "" ]; then
+  echo "Formatting namenode name directory: $namedir"
+  $HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
+fi
+
+$HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR namenode
\ No newline at end of file
diff --git a/hadoop/nodemanager/Dockerfile b/hadoop/nodemanager/Dockerfile
new file mode 100644
index 0000000..cc4db8e
--- /dev/null
+++ b/hadoop/nodemanager/Dockerfile
@@ -0,0 +1,11 @@
+FROM quay.io/cdis/hadoop-base:3.3.0
+
+HEALTHCHECK CMD curl -f http://localhost:8042/ || exit 1
+
+ADD run.sh /gen3spark/run.sh
+RUN chmod a+x /gen3spark/run.sh
+WORKDIR /gen3spark
+
+EXPOSE 8042
+
+CMD ["/gen3spark/run.sh"]
diff --git a/hadoop/nodemanager/run.sh b/hadoop/nodemanager/run.sh
new file mode 100644
index 0000000..b7a3c99
--- /dev/null
+++ b/hadoop/nodemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR nodemanager
\ No newline at end of file
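The namenode's run.sh formats the name directory only when the volume is empty, so CLUSTER_NAME must be set before the first start; later starts reuse the formatted volume. A minimal sketch (image, network, and volume names are assumptions):

```bash
docker volume create namenode-data
docker run -d --name namenode --network hadoop-net \
  -e CLUSTER_NAME=gen3 \
  -v namenode-data:/hadoop/dfs/name \
  -p 9870:9870 \
  quay.io/cdis/hadoop-namenode:3.3.0
```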
diff --git a/hadoop/resourcemanager/Dockerfile b/hadoop/resourcemanager/Dockerfile
new file mode 100644
index 0000000..cc4db8e
--- /dev/null
+++ b/hadoop/resourcemanager/Dockerfile
@@ -0,0 +1,11 @@
+FROM quay.io/cdis/hadoop-base:3.3.0
+
+HEALTHCHECK CMD curl -f http://localhost:8088/ || exit 1
+
+ADD run.sh /gen3spark/run.sh
+RUN chmod a+x /gen3spark/run.sh
+WORKDIR /gen3spark
+
+EXPOSE 8088
+
+CMD ["/gen3spark/run.sh"]
diff --git a/hadoop/resourcemanager/run.sh b/hadoop/resourcemanager/run.sh
new file mode 100644
index 0000000..b7a3c99
--- /dev/null
+++ b/hadoop/resourcemanager/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR resourcemanager
\ No newline at end of file
diff --git a/spark/base/Dockerfile b/spark/base/Dockerfile
new file mode 100644
index 0000000..86d3710
--- /dev/null
+++ b/spark/base/Dockerfile
@@ -0,0 +1,56 @@
+# To open a shell in a running container: docker exec -it <container> /bin/bash
+FROM quay.io/cdis/python:python3.9-buster-stable
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    HADOOP_VERSION="3" \
+    ES_HADOOP_VERSION="8.3.3" \
+    SPARK_VERSION="3.3.0"
+
+ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
+    SPARK_HOME="/spark" \
+    HADOOP_HOME="/hadoop" \
+    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"
+
+RUN mkdir -p /usr/share/man/man1
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    software-properties-common \
+    build-essential \
+    libssl1.1 \
+    libgnutls30 \
+    ca-certificates-java \
+    openjdk-11-jdk \
+    openssh-server \
+    # dependency for psycopg2, which sqlalchemy's postgres engine depends on
+    libpq-dev \
+    wget \
+    git \
+    # dependency for cryptography
+    libffi-dev \
+    # dependency for cryptography
+    libssl-dev \
+    vim \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN wget $SPARK_INSTALLATION_URL \
+    && mkdir -p $SPARK_HOME \
+    && tar -xvf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C $SPARK_HOME --strip-components 1 \
+    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
+
+RUN apt-get update && apt-get install -y --only-upgrade libpq-dev && rm -rf /var/lib/apt/lists/*
+
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin"
+
+
+RUN mkdir -p /var/run/sshd ${HADOOP_HOME}/hdfs/data/dfs/namenode ${HADOOP_HOME}/logs
+
+COPY . /gen3spark
+WORKDIR /gen3spark
+
+# ENV TINI_VERSION v0.18.0
+# ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+# RUN chmod +x /tini
+# ENTRYPOINT ["/tini", "--"]
+
+ENV PYTHONHASHSEED=1
+CMD ["/usr/sbin/sshd", "-D"]
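PYTHONHASHSEED=1 pins Python's hash randomization so a PySpark driver and its executors hash keys identically across processes. The image's default command is still sshd, but the PATH line exposes the Spark binaries, so a quick smoke test of the build (tag assumed to match what the images below expect) is:

```bash
docker run --rm quay.io/cdis/spark-base:3.3.0-hadoop3.3 spark-submit --version
```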
diff --git a/spark/master/Dockerfile b/spark/master/Dockerfile
new file mode 100644
index 0000000..3e0552e
--- /dev/null
+++ b/spark/master/Dockerfile
@@ -0,0 +1,15 @@
+FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3
+
+LABEL maintainer="Gezim Sejdiu, Giannis Mouchakis"
+
+ENV SPARK_MASTER_PORT=7077
+ENV SPARK_MASTER_WEBUI_PORT=8080
+ENV SPARK_MASTER_LOG=/spark/logs
+
+EXPOSE 8080 7077 6066
+
+COPY master.sh /gen3spark/master.sh
+WORKDIR /gen3spark
+
+CMD ["/bin/bash", "/gen3spark/master.sh"]
+
diff --git a/spark/master/master.sh b/spark/master/master.sh
new file mode 100644
index 0000000..e69de29
diff --git a/spark/submit/Dockerfile b/spark/submit/Dockerfile
new file mode 100644
index 0000000..403949a
--- /dev/null
+++ b/spark/submit/Dockerfile
@@ -0,0 +1,11 @@
+FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3
+
+LABEL maintainer="Gezim Sejdiu, Giannis Mouchakis"
+
+ENV SPARK_MASTER_NAME=spark-master
+ENV SPARK_MASTER_PORT=7077
+
+COPY submit.sh /gen3spark/submit.sh
+WORKDIR /gen3spark
+
+CMD ["/bin/bash", "/gen3spark/submit.sh"]
diff --git a/spark/submit/submit.sh b/spark/submit/submit.sh
new file mode 100644
index 0000000..e69de29
diff --git a/spark/worker/Dockerfile b/spark/worker/Dockerfile
new file mode 100644
index 0000000..37f9157
--- /dev/null
+++ b/spark/worker/Dockerfile
@@ -0,0 +1,13 @@
+FROM quay.io/cdis/spark-base:3.3.0-hadoop3.3
+
+LABEL maintainer="Gezim Sejdiu, Giannis Mouchakis"
+
+ENV SPARK_WORKER_WEBUI_PORT=8081
+ENV SPARK_WORKER_LOG=/spark/logs
+ENV SPARK_MASTER="spark://spark-master:7077"
+
+EXPOSE 8081
+
+COPY worker.sh /gen3spark/worker.sh
+
+CMD ["/bin/bash", "/gen3spark/worker.sh"]
\ No newline at end of file
diff --git a/spark/worker/worker.sh b/spark/worker/worker.sh
new file mode 100644
index 0000000..e69de29
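Note that master.sh, submit.sh, and worker.sh are committed as empty placeholders, so these images will not start a Spark cluster as-is. Once the scripts exist, wiring a master and worker together would look roughly like this (network name, tags, and the scripts' eventual behavior are all assumptions):

```bash
docker network create spark-net
docker run -d --name spark-master --network spark-net \
  -p 8080:8080 -p 7077:7077 \
  quay.io/cdis/spark-master:3.3.0-hadoop3.3
docker run -d --name spark-worker --network spark-net \
  -e SPARK_MASTER="spark://spark-master:7077" \
  -p 8081:8081 \
  quay.io/cdis/spark-worker:3.3.0-hadoop3.3
```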