add Dockerfiles for spark
thanh-nguyen-dang committed May 30, 2024
1 parent 81a46ac commit ada9910
Showing 20 changed files with 465 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/image_build_push.yaml
@@ -6,6 +6,10 @@ jobs:
  ci:
    name: Build Image and Push to Quay
    uses: uc-cdis/.github/.github/workflows/image_build_push.yaml@master
    with:
      OVERRIDE_REPO_NAME: hadoop-base
      OVERRIDE_TAG_NAME: 3.3.0
      DOCKERFILE_LOCATION: "./hadoop/base/Dockerfile"
    secrets:
      ECR_AWS_ACCESS_KEY_ID: ${{ secrets.ECR_AWS_ACCESS_KEY_ID }}
      ECR_AWS_SECRET_ACCESS_KEY: ${{ secrets.ECR_AWS_SECRET_ACCESS_KEY }}
102 changes: 102 additions & 0 deletions hadoop/base/Dockerfile
@@ -0,0 +1,102 @@
# To inspect a running container: docker exec -it <container-name> /bin/bash
FROM quay.io/cdis/python:python3.9-buster-stable

ENV DEBIAN_FRONTEND=noninteractive \
HADOOP_VERSION="3.3.2"

ENV HADOOP_INSTALLATION_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
HADOOP_HOME="/hadoop" \
JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"

RUN mkdir -p /usr/share/man/man1

RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
build-essential \
libssl1.1 \
libgnutls30 \
ca-certificates-java \
openjdk-11-jdk \
openssh-server \
# dependency for psycopg2, which the SQLAlchemy Postgres engine depends on
libpq-dev \
wget \
git \
# dependency for cryptography
libffi-dev \
# dependency for cryptography
libssl-dev \
vim \
net-tools \
netcat \
gnupg \
&& rm -rf /var/lib/apt/lists/*

RUN wget ${HADOOP_INSTALLATION_URL} \
&& mkdir -p $HADOOP_HOME \
&& tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
&& rm hadoop-${HADOOP_VERSION}.tar.gz \
&& rm -rf $HADOOP_HOME/share/doc

ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
HADOOP_MAPRED_HOME=$HADOOP_HOME \
HADOOP_COMMON_HOME=$HADOOP_HOME \
HADOOP_HDFS_HOME=$HADOOP_HOME \
YARN_HOME=$HADOOP_HOME \
HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

RUN apt-get update && apt-get install -y --only-upgrade libpq-dev && rm -rf /var/lib/apt/lists/*

ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"
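# Note: SPARK_HOME and SCALA_HOME are not set anywhere in this base image;
# presumably the Spark/Scala images built on top of it define them, so their
# PATH entries above expand to empty strings at this stage.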

ENV CORE_CONF_fs_defaultFS=hdfs://namenode:9000 \
CORE_CONF_hadoop_http_staticuser_user=root \
CORE_CONF_hadoop_proxyuser_hue_hosts=* \
CORE_CONF_hadoop_proxyuser_hue_groups=* \
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec \
HDFS_CONF_dfs_webhdfs_enabled=true \
HDFS_CONF_dfs_permissions_enabled=false \
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false \
YARN_CONF_yarn_log___aggregation___enable=true \
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ \
YARN_CONF_yarn_resourcemanager_recovery_enabled=true \
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore \
YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler \
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 \
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 \
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate \
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true \
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager \
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 \
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 \
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031 \
YARN_CONF_yarn_timeline___service_enabled=true \
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true \
YARN_CONF_yarn_timeline___service_hostname=historyserver \
YARN_CONF_mapreduce_map_output_compress=true \
YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec \
YARN_CONF_yarn_nodemanager_resource_memory___mb=16384 \
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8 \
YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 \
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs \
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle \
MAPRED_CONF_mapreduce_framework_name=yarn \
MAPRED_CONF_mapred_child_java_opts=-Xmx4096m \
MAPRED_CONF_mapreduce_map_memory_mb=4096 \
MAPRED_CONF_mapreduce_reduce_memory_mb=8192 \
MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m \
MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m \
MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/
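
# Note: entrypoint.sh rewrites each *_CONF_* variable above into a property in
# the matching Hadoop XML config file, mapping ___ to '-', __ to '_', and _ to
# '.' in the property name, e.g. YARN_CONF_yarn_nodemanager_aux___services
# becomes yarn.nodemanager.aux-services in yarn-site.xml.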

COPY . /gen3spark
WORKDIR /gen3spark

# ENV TINI_VERSION v0.18.0
# ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
# RUN chmod +x /tini
# ENTRYPOINT ["/tini", "--"]

CMD ["/usr/sbin/sshd", "-D"]
116 changes: 116 additions & 0 deletions hadoop/base/entrypoint.sh
@@ -0,0 +1,116 @@
#!/bin/bash

# Set some sensible defaults
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}

function addProperty() {
  local path=$1
  local name=$2
  local value=$3

  local entry="<property><name>$name</name><value>${value}</value></property>"
  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
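
# For illustration: addProperty $HADOOP_CONF_DIR/core-site.xml fs.defaultFS hdfs://namenode:9000
# inserts <property><name>fs.defaultFS</name><value>hdfs://namenode:9000</value></property>
# immediately before the closing </configuration> tag.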

function configure() {
  local path=$1
  local module=$2
  local envPrefix=$3

  local var
  local value

  echo "Configuring $module"
  for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
    name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
    var="${envPrefix}_${c}"
    value=${!var}
    echo " - Setting $name=$value"
    addProperty $path $name "$value"
  done
}
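
# For illustration: with envPrefix=YARN_CONF, the environment variable
# YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle becomes the
# yarn-site.xml property yarn.nodemanager.aux-services=mapreduce_shuffle
# (___ maps to '-', __ to '_', and _ to '.').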

configure $HADOOP_CONF_DIR/core-site.xml core CORE_CONF
configure $HADOOP_CONF_DIR/hdfs-site.xml hdfs HDFS_CONF
configure $HADOOP_CONF_DIR/yarn-site.xml yarn YARN_CONF
configure $HADOOP_CONF_DIR/httpfs-site.xml httpfs HTTPFS_CONF
configure $HADOOP_CONF_DIR/kms-site.xml kms KMS_CONF
configure $HADOOP_CONF_DIR/mapred-site.xml mapred MAPRED_CONF

if [ "$MULTIHOMED_NETWORK" = "1" ]; then
echo "Configuring for multihomed network"

# HDFS
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true

# YARN
addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0

# MAPRED
addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
fi

if [ -n "$GANGLIA_HOST" ]; then
mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig

for module in mapred jvm rpc ugi; do
echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
echo "$module.period=10"
echo "$module.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics.properties

for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
echo "$module.sink.ganglia.period=10"
echo "$module.sink.ganglia.supportsparse=true"
echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
done > /etc/hadoop/hadoop-metrics2.properties
fi

function wait_for_it() {
  local serviceport=$1
  local service=${serviceport%%:*}
  local port=${serviceport#*:}
  local retry_seconds=5
  local max_try=100
  let i=1

  nc -z $service $port
  result=$?

  until [ $result -eq 0 ]; do
    echo "[$i/$max_try] check for ${service}:${port}..."
    echo "[$i/$max_try] ${service}:${port} is not available yet"
    if (( $i == $max_try )); then
      echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
      exit 1
    fi

    echo "[$i/$max_try] try in ${retry_seconds}s once again ..."
    let "i++"
    sleep $retry_seconds

    nc -z $service $port
    result=$?
  done
  echo "[$i/$max_try] ${service}:${port} is available."
}
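
# For illustration: setting SERVICE_PRECONDITION="namenode:9000 datanode:9864"
# makes this container wait until each host:port pair accepts TCP connections.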

for i in ${SERVICE_PRECONDITION[@]}
do
  wait_for_it ${i}
done

exec "$@"
15 changes: 15 additions & 0 deletions hadoop/datanode/Dockerfile
@@ -0,0 +1,15 @@
FROM quay.io/cdis/hadoop-base:3.3.0

HEALTHCHECK CMD curl -f http://localhost:9864/ || exit 1

ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data

ADD run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

EXPOSE 9864

CMD ["/gen3spark/run.sh"]
9 changes: 9 additions & 0 deletions hadoop/datanode/run.sh
@@ -0,0 +1,9 @@
#!/bin/bash

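# Resolve the local data directory by stripping the file:// scheme from the HDFS setting.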
datadir=`echo $HDFS_CONF_dfs_datanode_data_dir | perl -pe 's#file://##'`
if [ ! -d $datadir ]; then
echo "Datanode data directory not found: $datadir"
exit 2
fi

$HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR datanode
15 changes: 15 additions & 0 deletions hadoop/historyserver/Dockerfile
@@ -0,0 +1,15 @@
FROM quay.io/cdis/hadoop-base:3.3.0

# The base image points yarn.timeline-service.hostname and yarn.log.server.url
# at this container on port 8188, so health-check and expose that port.
HEALTHCHECK CMD curl -f http://localhost:8188/ || exit 1

ENV YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path=/hadoop/yarn/timeline
RUN mkdir -p /hadoop/yarn/timeline
VOLUME /hadoop/yarn/timeline

ADD run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

EXPOSE 8188

CMD ["/gen3spark/run.sh"]
22 changes: 22 additions & 0 deletions hadoop/historyserver/run.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Launch the YARN timeline/application history server; the base image points
# yarn.log.server.url and yarn.timeline-service.hostname at this container on
# port 8188.
storedir=$YARN_CONF_yarn_timeline___service_leveldb___timeline___store_path
if [ ! -d $storedir ]; then
  echo "Timeline store directory not found: $storedir"
  exit 2
fi

$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR timelineserver
15 changes: 15 additions & 0 deletions hadoop/namenode/Dockerfile
@@ -0,0 +1,15 @@
FROM quay.io/cdis/hadoop-base:3.3.0

HEALTHCHECK CMD curl -f http://localhost:9870/ || exit 1

ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name

ADD run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

EXPOSE 9870

CMD ["/gen3spark/run.sh"]
22 changes: 22 additions & 0 deletions hadoop/namenode/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

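# Resolve the local name directory by stripping the file:// scheme from the HDFS setting.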
namedir=`echo $HDFS_CONF_dfs_namenode_name_dir | perl -pe 's#file://##'`
if [ ! -d $namedir ]; then
echo "Namenode name directory not found: $namedir"
exit 2
fi

if [ -z "$CLUSTER_NAME" ]; then
echo "Cluster name not specified"
exit 2
fi

echo "remove lost+found from $namedir"
rm -r $namedir/lost+found

if [ "`ls -A $namedir`" == "" ]; then
echo "Formatting namenode name directory: $namedir"
$HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR namenode -format $CLUSTER_NAME
fi

$HADOOP_HOME/bin/hdfs --config $HADOOP_CONF_DIR namenode
15 changes: 15 additions & 0 deletions hadoop/nodemanager/Dockerfile
@@ -0,0 +1,15 @@
FROM quay.io/cdis/hadoop-base:3.3.0

# The NodeManager web UI listens on port 8042.
HEALTHCHECK CMD curl -f http://localhost:8042/ || exit 1

ADD run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

EXPOSE 8042

CMD ["/gen3spark/run.sh"]
9 changes: 9 additions & 0 deletions hadoop/nodemanager/run.sh
@@ -0,0 +1,9 @@
#!/bin/bash

# Launch the YARN NodeManager with the Hadoop config from the base image.
$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR nodemanager
15 changes: 15 additions & 0 deletions hadoop/resourcemanager/Dockerfile
@@ -0,0 +1,15 @@
FROM quay.io/cdis/hadoop-base:3.3.0

# The ResourceManager web UI listens on port 8088; its RPC ports
# (8032/8030/8031) are configured in the base image.
HEALTHCHECK CMD curl -f http://localhost:8088/ || exit 1

ADD run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

EXPOSE 8088

CMD ["/gen3spark/run.sh"]
9 changes: 9 additions & 0 deletions hadoop/resourcemanager/run.sh
@@ -0,0 +1,9 @@
#!/bin/bash

# Launch the YARN ResourceManager with the Hadoop config from the base image.
$HADOOP_HOME/bin/yarn --config $HADOOP_CONF_DIR resourcemanager