-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
81a46ac
commit ada9910
Showing
20 changed files
with
465 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
# To check running container: docker exec -it tube /bin/bash
FROM quay.io/cdis/python:python3.9-buster-stable

# NOTE(review): DEBIAN_FRONTEND is a build-time setting; it is kept in ENV to
# preserve the original image's runtime environment, but an ARG (or an inline
# assignment on the apt-get RUN) would avoid leaking it into containers.
ENV DEBIAN_FRONTEND=noninteractive \
    HADOOP_VERSION="3.3.2"

# Download over HTTPS (original used plain HTTP, which is open to tampering in
# transit).
ENV HADOOP_INSTALLATION_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
    HADOOP_HOME="/hadoop" \
    JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64/"

# openjdk's postinst script fails on slim Debian images if this man directory
# is missing.
RUN mkdir -p /usr/share/man/man1

# Install build/runtime dependencies in one layer and drop the apt lists so
# they are not baked into the image.  (libpq-dev appeared twice in the
# original list; it is listed once here.)
RUN apt-get update && apt-get install -y --no-install-recommends \
    software-properties-common \
    build-essential \
    libssl1.1 \
    libgnutls30 \
    ca-certificates-java \
    openjdk-11-jdk \
    openssh-server \
    # dependency for psycopg2 - which is a dependency for the sqlalchemy postgres engine
    libpq-dev \
    wget \
    git \
    # dependencies for cryptography
    libffi-dev \
    libssl-dev \
    vim \
    net-tools \
    netcat \
    gnupg \
    && rm -rf /var/lib/apt/lists/*

# Fetch and unpack Hadoop; remove the tarball and bundled docs in the same
# layer so they never persist in the image.
RUN wget ${HADOOP_INSTALLATION_URL} \
    && mkdir -p $HADOOP_HOME \
    && tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1 \
    && rm hadoop-${HADOOP_VERSION}.tar.gz \
    && rm -rf $HADOOP_HOME/share/doc

# Standard Hadoop layout variables, all derived from HADOOP_HOME.
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
    HADOOP_MAPRED_HOME=$HADOOP_HOME \
    HADOOP_COMMON_HOME=$HADOOP_HOME \
    HADOOP_HDFS_HOME=$HADOOP_HOME \
    YARN_HOME=$HADOOP_HOME \
    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

# Pull in any security update for libpq-dev.  BUG FIX: the original ran
# `apt-get --only-upgrade install libpq-dev` with no preceding `apt-get update`
# (the package lists were removed above) and no `-y`, so it could not work in a
# non-interactive build.
RUN apt-get update \
    && apt-get install -y --only-upgrade libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# NOTE(review): SPARK_HOME and SCALA_HOME are never defined in this image, so
# those segments expand to /bin and /sbin; kept verbatim for compatibility
# with derived images that may define them — confirm before removing.
ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"

# Hadoop site configuration, rendered into the *-site.xml files by the
# entrypoint script.  Name mangling there: '___' -> '-', '__' -> '_',
# '_' -> '.' in property names.
ENV CORE_CONF_fs_defaultFS=hdfs://namenode:9000 \
    CORE_CONF_hadoop_http_staticuser_user=root \
    CORE_CONF_hadoop_proxyuser_hue_hosts=* \
    CORE_CONF_hadoop_proxyuser_hue_groups=* \
    CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec \
    HDFS_CONF_dfs_webhdfs_enabled=true \
    HDFS_CONF_dfs_permissions_enabled=false \
    HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false \
    YARN_CONF_yarn_log___aggregation___enable=true \
    YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/ \
    YARN_CONF_yarn_resourcemanager_recovery_enabled=true \
    YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore \
    YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler \
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192 \
    YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4 \
    YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate \
    YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true \
    YARN_CONF_yarn_resourcemanager_hostname=resourcemanager \
    YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032 \
    YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030 \
    YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031 \
    YARN_CONF_yarn_timeline___service_enabled=true \
    YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true \
    YARN_CONF_yarn_timeline___service_hostname=historyserver \
    YARN_CONF_mapreduce_map_output_compress=true \
    YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec \
    YARN_CONF_yarn_nodemanager_resource_memory___mb=16384 \
    YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8 \
    YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5 \
    YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs \
    YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle \
    MAPRED_CONF_mapreduce_framework_name=yarn \
    MAPRED_CONF_mapred_child_java_opts=-Xmx4096m \
    MAPRED_CONF_mapreduce_map_memory_mb=4096 \
    MAPRED_CONF_mapreduce_reduce_memory_mb=8192 \
    MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m \
    MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m \
    MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
    MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/ \
    MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=$HADOOP_HOME/

COPY . /gen3spark
WORKDIR /gen3spark

# ENV TINI_VERSION v0.18.0
# ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
# RUN chmod +x /tini
# ENTRYPOINT ["/tini", "--"]

# Run sshd in the foreground so it stays PID 1 and receives docker stop's
# SIGTERM directly.
CMD ["/usr/sbin/sshd", "-D"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/bin/bash
# Hadoop container entrypoint: renders the Hadoop *-site.xml config files from
# environment variables, applies multihomed-network and Ganglia overrides,
# waits for declared service preconditions, then execs the container command.

# Default the HDFS filesystem URI to this host when the caller did not set it.
export CORE_CONF_fs_defaultFS=${CORE_CONF_fs_defaultFS:-hdfs://`hostname -f`:8020}

# addProperty <file> <name> <value>
# Inserts <property><name>..</name><value>..</value></property> immediately
# before the closing </configuration> tag of a Hadoop site XML file.
function addProperty() {
  local path=$1
  local name=$2
  local value=$3

  local entry="<property><name>$name</name><value>${value}</value></property>"
  # Escape forward slashes so the entry is safe inside the sed replacement.
  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" "$path"
}

# configure <file> <module-name> <envPrefix>
# Converts every environment variable ${envPrefix}_X=V into property X=V in
# <file>.  Property-name mangling: '___' -> '-', '__' -> '_', '_' -> '.'
# (the '@' placeholder keeps '__' from being consumed by the '_' rule).
function configure() {
  local path=$1
  local module=$2
  local envPrefix=$3

  local var
  local value

  echo "Configuring $module"
  for c in `printenv | perl -sne 'print "$1 " if m/^${envPrefix}_(.+?)=.*/' -- -envPrefix=$envPrefix`; do
    name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
    var="${envPrefix}_${c}"
    value=${!var}
    echo " - Setting $name=$value"
    addProperty "$path" "$name" "$value"
  done
}

configure /etc/hadoop/core-site.xml core CORE_CONF
configure /etc/hadoop/hdfs-site.xml hdfs HDFS_CONF
configure /etc/hadoop/yarn-site.xml yarn YARN_CONF
configure /etc/hadoop/httpfs-site.xml httpfs HTTPFS_CONF
configure /etc/hadoop/kms-site.xml kms KMS_CONF
configure /etc/hadoop/mapred-site.xml mapred MAPRED_CONF

if [ "$MULTIHOMED_NETWORK" = "1" ]; then
  echo "Configuring for multihomed network"

  # HDFS: bind daemons to all interfaces and address nodes by hostname.
  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.rpc-bind-host 0.0.0.0
  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.servicerpc-bind-host 0.0.0.0
  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.http-bind-host 0.0.0.0
  addProperty /etc/hadoop/hdfs-site.xml dfs.namenode.https-bind-host 0.0.0.0
  addProperty /etc/hadoop/hdfs-site.xml dfs.client.use.datanode.hostname true
  addProperty /etc/hadoop/hdfs-site.xml dfs.datanode.use.datanode.hostname true

  # YARN
  addProperty /etc/hadoop/yarn-site.xml yarn.resourcemanager.bind-host 0.0.0.0
  addProperty /etc/hadoop/yarn-site.xml yarn.nodemanager.bind-host 0.0.0.0
  addProperty /etc/hadoop/yarn-site.xml yarn.timeline-service.bind-host 0.0.0.0

  # MAPRED
  addProperty /etc/hadoop/mapred-site.xml yarn.nodemanager.bind-host 0.0.0.0
fi

if [ -n "$GANGLIA_HOST" ]; then
  # Replace the stock metrics config with one that ships everything to the
  # Ganglia collector; originals are kept as *.orig.
  mv /etc/hadoop/hadoop-metrics.properties /etc/hadoop/hadoop-metrics.properties.orig
  mv /etc/hadoop/hadoop-metrics2.properties /etc/hadoop/hadoop-metrics2.properties.orig

  for module in mapred jvm rpc ugi; do
    echo "$module.class=org.apache.hadoop.metrics.ganglia.GangliaContext31"
    echo "$module.period=10"
    echo "$module.servers=$GANGLIA_HOST:8649"
  done > /etc/hadoop/hadoop-metrics.properties

  for module in namenode datanode resourcemanager nodemanager mrappmaster jobhistoryserver; do
    echo "$module.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31"
    echo "$module.sink.ganglia.period=10"
    echo "$module.sink.ganglia.supportsparse=true"
    echo "$module.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both"
    echo "$module.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40"
    echo "$module.sink.ganglia.servers=$GANGLIA_HOST:8649"
  done > /etc/hadoop/hadoop-metrics2.properties
fi

# wait_for_it <host>:<port>
# Polls the service with netcat every retry_seconds until it accepts
# connections, failing the container after max_try attempts.
function wait_for_it()
{
  local serviceport=$1
  local service=${serviceport%%:*}
  local port=${serviceport#*:}
  local retry_seconds=5
  local max_try=100
  let i=1

  nc -z "$service" "$port"
  result=$?

  until [ $result -eq 0 ]; do
    echo "[$i/$max_try] check for ${service}:${port}..."
    echo "[$i/$max_try] ${service}:${port} is not available yet"
    if (( $i == $max_try )); then
      echo "[$i/$max_try] ${service}:${port} is still not available; giving up after ${max_try} tries. :/"
      exit 1
    fi

    echo "[$i/$max_try] try in ${retry_seconds}s once again ..."
    let "i++"
    sleep $retry_seconds

    nc -z "$service" "$port"
    result=$?
  done
  echo "[$i/$max_try] $service:${port} is available."
}

# SERVICE_PRECONDITION is intentionally left unquoted: it is a
# whitespace-separated list of host:port entries (env vars cannot carry real
# arrays into a container).
for i in ${SERVICE_PRECONDITION[@]}
do
  wait_for_it ${i}
done

# BUG FIX: was `exec $@`; quoting preserves arguments that contain whitespace
# and prevents unintended glob expansion.
exec "$@"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM quay.io/cdis/hadoop-base:3.3.0

# Probe the datanode web UI to let the orchestrator detect a wedged container.
HEALTHCHECK CMD curl -f http://localhost:9864/ || exit 1

# Where HDFS block data lives; declared as a volume (after it is created) so
# data survives container restarts.
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data

# BUG FIX: the script was ADDed to /run.sh while CMD launches
# /gen3spark/run.sh, so the container could never start.  Copy it to the path
# CMD expects; COPY is preferred over ADD for plain local files (DL3020).
COPY run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

# Datanode HTTP port (documentation only; does not publish the port).
EXPOSE 9864

CMD ["/gen3spark/run.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash
# Launches the HDFS datanode after verifying its data directory exists.

# Strip the file:// scheme to get the local filesystem path.
datadir=$(echo "$HDFS_CONF_dfs_datanode_data_dir" | perl -pe 's#file://##')

# BUG FIX: $datadir was unquoted; with an empty/unset variable the test
# degenerated to `[ ! -d ]`, which never triggered the guard.
if [ ! -d "$datadir" ]; then
  echo "Datanode data directory not found: $datadir"
  exit 2
fi

# Run in the foreground so the datanode is the container's main process.
$HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" datanode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM quay.io/cdis/hadoop-base:3.3.0

# Probe the namenode web UI to let the orchestrator detect a wedged container.
HEALTHCHECK CMD curl -f http://localhost:9870/ || exit 1

# Where HDFS namespace metadata lives; declared as a volume (after it is
# created) so metadata survives container restarts.
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name

# COPY is preferred over ADD for plain local files (hadolint DL3020).
COPY run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

# Namenode HTTP port (documentation only; does not publish the port).
EXPOSE 9870

CMD ["/gen3spark/run.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
# Launches the HDFS namenode, formatting the name directory on first start.

# Strip the file:// scheme to get the local filesystem path.
namedir=$(echo "$HDFS_CONF_dfs_namenode_name_dir" | perl -pe 's#file://##')
if [ ! -d "$namedir" ]; then
  echo "Namenode name directory not found: $namedir"
  exit 2
fi

# A cluster name is required for `hdfs namenode -format` below.
if [ -z "$CLUSTER_NAME" ]; then
  echo "Cluster name not specified"
  exit 2
fi

# lost+found (present when the volume is a fresh ext filesystem) would make
# the directory look non-empty and skip formatting.  BUG FIX: use -f so the
# rm does not report an error when the entry is absent.
echo "remove lost+found from $namedir"
rm -rf "$namedir/lost+found"

# Only format a brand-new (empty) name directory — formatting an existing one
# would destroy the filesystem metadata.
if [ "$(ls -A "$namedir")" == "" ]; then
  echo "Formatting namenode name directory: $namedir"
  $HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" namenode -format "$CLUSTER_NAME"
fi

# Run in the foreground so the namenode is the container's main process.
$HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" namenode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM quay.io/cdis/hadoop-base:3.3.0

# Probe the namenode web UI to let the orchestrator detect a wedged container.
HEALTHCHECK CMD curl -f http://localhost:9870/ || exit 1

# Where HDFS namespace metadata lives; declared as a volume (after it is
# created) so metadata survives container restarts.
ENV HDFS_CONF_dfs_namenode_name_dir=file:///hadoop/dfs/name
RUN mkdir -p /hadoop/dfs/name
VOLUME /hadoop/dfs/name

# COPY is preferred over ADD for plain local files (hadolint DL3020).
COPY run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

# Namenode HTTP port (documentation only; does not publish the port).
EXPOSE 9870

CMD ["/gen3spark/run.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
# Launches the HDFS namenode, formatting the name directory on first start.

# Strip the file:// scheme to get the local filesystem path.
namedir=$(echo "$HDFS_CONF_dfs_namenode_name_dir" | perl -pe 's#file://##')
if [ ! -d "$namedir" ]; then
  echo "Namenode name directory not found: $namedir"
  exit 2
fi

# A cluster name is required for `hdfs namenode -format` below.
if [ -z "$CLUSTER_NAME" ]; then
  echo "Cluster name not specified"
  exit 2
fi

# lost+found (present when the volume is a fresh ext filesystem) would make
# the directory look non-empty and skip formatting.  BUG FIX: use -f so the
# rm does not report an error when the entry is absent.
echo "remove lost+found from $namedir"
rm -rf "$namedir/lost+found"

# Only format a brand-new (empty) name directory — formatting an existing one
# would destroy the filesystem metadata.
if [ "$(ls -A "$namedir")" == "" ]; then
  echo "Formatting namenode name directory: $namedir"
  $HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" namenode -format "$CLUSTER_NAME"
fi

# Run in the foreground so the namenode is the container's main process.
$HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" namenode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM quay.io/cdis/hadoop-base:3.3.0

# Probe the datanode web UI to let the orchestrator detect a wedged container.
HEALTHCHECK CMD curl -f http://localhost:9864/ || exit 1

# Where HDFS block data lives; declared as a volume (after it is created) so
# data survives container restarts.
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data

# BUG FIX: the script was ADDed to /run.sh while CMD launches
# /gen3spark/run.sh, so the container could never start.  Copy it to the path
# CMD expects; COPY is preferred over ADD for plain local files (DL3020).
COPY run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

# Datanode HTTP port (documentation only; does not publish the port).
EXPOSE 9864

CMD ["/gen3spark/run.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash
# Launches the HDFS datanode after verifying its data directory exists.

# Strip the file:// scheme to get the local filesystem path.
datadir=$(echo "$HDFS_CONF_dfs_datanode_data_dir" | perl -pe 's#file://##')

# BUG FIX: $datadir was unquoted; with an empty/unset variable the test
# degenerated to `[ ! -d ]`, which never triggered the guard.
if [ ! -d "$datadir" ]; then
  echo "Datanode data directory not found: $datadir"
  exit 2
fi

# Run in the foreground so the datanode is the container's main process.
$HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" datanode
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
FROM quay.io/cdis/hadoop-base:3.3.0

# Probe the datanode web UI to let the orchestrator detect a wedged container.
HEALTHCHECK CMD curl -f http://localhost:9864/ || exit 1

# Where HDFS block data lives; declared as a volume (after it is created) so
# data survives container restarts.
ENV HDFS_CONF_dfs_datanode_data_dir=file:///hadoop/dfs/data
RUN mkdir -p /hadoop/dfs/data
VOLUME /hadoop/dfs/data

# BUG FIX: the script was ADDed to /run.sh while CMD launches
# /gen3spark/run.sh, so the container could never start.  Copy it to the path
# CMD expects; COPY is preferred over ADD for plain local files (DL3020).
COPY run.sh /gen3spark/run.sh
RUN chmod a+x /gen3spark/run.sh
WORKDIR /gen3spark

# Datanode HTTP port (documentation only; does not publish the port).
EXPOSE 9864

CMD ["/gen3spark/run.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
#!/bin/bash
# Launches the HDFS datanode after verifying its data directory exists.

# Strip the file:// scheme to get the local filesystem path.
datadir=$(echo "$HDFS_CONF_dfs_datanode_data_dir" | perl -pe 's#file://##')

# BUG FIX: $datadir was unquoted; with an empty/unset variable the test
# degenerated to `[ ! -d ]`, which never triggered the guard.
if [ ! -d "$datadir" ]; then
  echo "Datanode data directory not found: $datadir"
  exit 2
fi

# Run in the foreground so the datanode is the container's main process.
$HADOOP_HOME/bin/hdfs --config "$HADOOP_CONF_DIR" datanode
Oops, something went wrong.