
Commit

feat(init): spark automation
thanh-nguyen-dang committed Aug 21, 2018
1 parent c61c5c1 commit 386f80f
Showing 6 changed files with 216 additions and 1 deletion.
105 changes: 105 additions & 0 deletions Dockerfile
@@ -0,0 +1,105 @@
# To check running container: docker exec -it tube-spark /bin/bash
FROM python:2

ENV DEBIAN_FRONTEND=noninteractive \
    SPARK_VERSION="2.3.1" \
    HADOOP_VERSION="3.1.0" \
    SCALA_VERSION="2.12.6"

ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" \
    HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
    SCALA_INSTALLATION_URL="https://downloads.lightbend.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz"

ENV SPARK_HOME="/spark" \
    HADOOP_HOME="/hadoop" \
    SCALA_HOME="/scala" \
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64/"

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    openssh-server \
    wget \
    git \
    openjdk-8-jdk \
    # dependency for cryptography
    libffi-dev \
    # dependency for psycopg2 - which is a dependency for the sqlalchemy postgres engine
    libpq-dev \
    # dependency for cryptography
    libssl-dev \
    python-dev \
    python-pip \
    python-setuptools \
    vim \
    net-tools

RUN wget $SPARK_INSTALLATION_URL
RUN mkdir -p $SPARK_HOME
RUN tar -xvf spark-${SPARK_VERSION}-bin-without-hadoop.tgz -C $SPARK_HOME --strip-components 1
RUN rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz

RUN wget ${HADOOP_INSTALLATION_URL}
RUN mkdir -p $HADOOP_HOME
RUN tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1
RUN rm hadoop-${HADOOP_VERSION}.tar.gz

RUN wget ${SCALA_INSTALLATION_URL}
RUN mkdir -p /scala
RUN tar -xvf scala-${SCALA_VERSION}.tgz -C ${SCALA_HOME} --strip-components 1
RUN rm scala-${SCALA_VERSION}.tgz

ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
    HADOOP_MAPRED_HOME=$HADOOP_HOME \
    HADOOP_COMMON_HOME=$HADOOP_HOME \
    HADOOP_HDFS_HOME=$HADOOP_HOME \
    YARN_HOME=$HADOOP_HOME \
    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"

RUN echo 'export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HADOOP_OS_TYPE="${HADOOP_OS_TYPE:-$(uname -s)}"' >> ${HADOOP_CONF_DIR}/hadoop-env.sh && \
    echo 'export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_DATANODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_NAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_SECONDARYNAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export JAVA_HOME=${JAVA_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HADOOP_HOME=${HADOOP_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export YARN_RESOURCEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
    echo "export YARN_NODEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
    echo "export SPARK_DIST_CLASSPATH=$(hadoop --config $HADOOP_HOME/etc/hadoop classpath):/hadoop/share/hadoop/tools/lib/*" >> ${SPARK_HOME}/conf/spark-env.sh && \
    echo "spark.eventLog.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.eventLog.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.eventLog.dir hdfs://0.0.0.0:8021/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.history.fs.logDirectory file:/spark/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.ui.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.broadcast.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.io.compression.snappy.blockSize 32k" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.serializer org.apache.spark.serializer.KryoSerializer" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.app.name gen3spark" >> ${SPARK_HOME}/conf/spark-defaults.conf


EXPOSE 22 4040 8020 8042 8088 9000 10020 19888 50010 50020 50070 50075 50090

RUN mkdir /var/run/sshd
RUN mkdir /hadoop/hdfs
RUN mkdir /hadoop/hdfs/data
RUN mkdir /hadoop/hdfs/data/dfs
RUN mkdir /hadoop/hdfs/data/dfs/namenode
RUN mkdir /hadoop/logs


COPY . /gen3spark
WORKDIR /gen3spark

ENV TINI_VERSION v0.18.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

CMD ["/usr/sbin/sshd", "-D"]
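Not part of the commit: a minimal smoke test for the image above, sketched with the Docker SDK for Python (docker-py). The "tube-spark" tag follows the container name mentioned in the comment at the top of the Dockerfile; the tag, the probe commands, and the use of docker-py are assumptions.

import docker

client = docker.from_env()
# Build the image from the repository root (hypothetical tag).
image, _ = client.images.build(path=".", tag="tube-spark")
# CMD keeps sshd in the foreground, so the container stays up after starting.
container = client.containers.run("tube-spark", name="tube-spark", detach=True)
# Spark and Hadoop binaries are on PATH thanks to the ENV lines above.
print(container.exec_run("spark-submit --version").output.decode())
print(container.exec_run("hadoop version").output.decode())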
2 changes: 1 addition & 1 deletion README.md
@@ -1 +1 @@
# gen3-spark
# gen3-spark
Empty file added gen3_spark/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions gen3_spark/local_settings.example.py
@@ -0,0 +1,6 @@
import os

HADOOP_HOME = os.getenv("HADOOP_HOME", "/usr/local/Cellar/hadoop/3.1.0/libexec/")
JAVA_HOME = os.getenv("JAVA_HOME", "/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home")
HADOOP_URL = os.getenv("HADOOP_URL", "hdfs://localhost:9000")
HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
16 changes: 16 additions & 0 deletions gen3_spark/settings.py
@@ -0,0 +1,16 @@
import os

try:
    # Import everything from ``local_settings``, if it exists.
    from gen3_spark.local_settings import *
except ImportError:
    # If it doesn't, load it from ``/gen3-spark/gen3_spark/local_settings.py``.
    try:
        import imp
        imp.load_source('local_settings', '/gen3-spark/gen3_spark/local_settings.py')
        print('finished importing')
    except IOError:
        HADOOP_HOME = os.getenv("HADOOP_HOME", "")
        JAVA_HOME = os.getenv("JAVA_HOME", "")
        HADOOP_URL = os.getenv("HADOOP_URL", "")
        HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
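For context (not in the commit): settings.py falls back from the packaged local_settings module, to the absolute /gen3-spark path, to plain environment variables. Below is a minimal sketch of the environment-variable path, assuming no local_settings.py is present; the URL value is hypothetical.

import os

# Hypothetical values; a real deployment would set these before importing the settings module.
os.environ.setdefault("HADOOP_URL", "hdfs://spark:9000")
os.environ.setdefault("HADOOP_HOST", "spark")

import gen3_spark.settings as config  # falls through to os.getenv when no local_settings.py is found
print(config.HADOOP_URL)
print(config.HADOOP_HOST)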
88 changes: 88 additions & 0 deletions run_config.py
@@ -0,0 +1,88 @@
import xml.etree.ElementTree as et
import gen3_spark.settings as config


CONFIG_PATH = '{}/etc/hadoop/'.format(config.HADOOP_HOME)


def indent(elem, level=0):
    i = "\n" + level*" "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + " "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indent(elem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i


def configure_core_site():
    core_site_path = '{}core-site.xml'.format(CONFIG_PATH)
    tree = et.parse(core_site_path)
    root = tree.getroot()
    root.append(create_property('hadoop.tmp.dir', '{}/hdfs/tmp'.format(config.HADOOP_HOME)))
    root.append(create_property('fs.default.name', config.HADOOP_URL))
    indent(root)
    tree.write(core_site_path)


def configure_hdfs_site():
    core_site_path = '{}hdfs-site.xml'.format(CONFIG_PATH)
    tree = et.parse(core_site_path)
    root = tree.getroot()
    root.append(create_property('dfs.blocksize', '268435456'))
    root.append(create_property('dfs.hosts', '0.0.0.0'))
    root.append(create_property('dfs.namenode.handler.count', '100'))
    root.append(create_property('dfs.namenode.name.dir', '/hadoop/hdfs/data/dfs/namenode'))
    root.append(create_property('dfs.datanode.data.dir', '/hadoop/hdfs/data/dfs/datanode'))
    root.append(create_property('dfs.namenode.http-bind-host', config.HADOOP_HOST))
    root.append(create_property('dfs.namenode.https-bind-host', config.HADOOP_HOST))
    root.append(create_property('dfs.client.use.datanode.hostname', 'true'))
    root.append(create_property('dfs.datanode.use.datanode.hostname', 'true'))
    indent(root)
    tree.write(core_site_path)


def configure_yarn_site():
    core_site_path = '{}yarn-site.xml'.format(CONFIG_PATH)
    tree = et.parse(core_site_path)
    root = tree.getroot()
    root.append(create_property('yarn.nodemanager.aux-services', 'mapreduce_shuffle'))
    root.append(create_property('yarn.resourcemanager.scheduler.address', '{}:8030'.format(config.HADOOP_HOST)))
    root.append(create_property('yarn.resourcemanager.resource-tracker.address', '{}:8031'.format(config.HADOOP_HOST)))
    root.append(create_property('yarn.resourcemanager.address', '{}:8032'.format(config.HADOOP_HOST)))
    tree.write(core_site_path)


def configure_mapred_site():
    core_site_path = '{}mapred-site.xml'.format(CONFIG_PATH)
    tree = et.parse(core_site_path)
    root = tree.getroot()
    root.append(create_property('mapreduce.framework.name', 'yarn'))
    root.append(create_property('mapreduce.application.classpath',
                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:'
                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*'))
    indent(root)
    tree.write(core_site_path)


def create_property(prop_name, prop_val):
    prop = et.Element('property')
    name = et.Element('name')
    name.text = prop_name
    value = et.Element('value')
    value.text = prop_val
    prop.append(name)
    prop.append(value)
    return prop


if __name__ == '__main__':
    configure_core_site()
    configure_hdfs_site()
    configure_mapred_site()

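For illustration only (not part of the commit): run_config.py is presumably invoked as "python run_config.py" inside the container, and each configure_* helper appends <property> elements built by create_property to an existing Hadoop site file. A standalone sketch of what one appended property looks like; the property value below is hypothetical.

import xml.etree.ElementTree as et
from run_config import create_property, indent

# Build a bare <configuration> root, the same element the Hadoop site files use.
root = et.Element('configuration')
root.append(create_property('fs.default.name', 'hdfs://spark:9000'))
indent(root)
# The serialized result contains a nested block of the form
# <property><name>fs.default.name</name><value>hdfs://spark:9000</value></property>,
# pretty-printed by indent().
print(et.tostring(root))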