From 386f80fd6fde61b615cca2f5cc440c91ca4b54ab Mon Sep 17 00:00:00 2001
From: Thanh Nguyen
Date: Tue, 14 Aug 2018 14:21:08 -0500
Subject: [PATCH] feat(init): spark automation

---
 Dockerfile                           | 105 +++++++++++++++++++++++++++
 README.md                            |   2 +-
 gen3_spark/__init__.py               |   0
 gen3_spark/local_settings.example.py |   6 ++
 gen3_spark/settings.py               |  16 ++++
 run_config.py                        |  89 ++++++++++++++++++++++
 6 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile
 create mode 100644 gen3_spark/__init__.py
 create mode 100644 gen3_spark/local_settings.example.py
 create mode 100644 gen3_spark/settings.py
 create mode 100644 run_config.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ff9242e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,105 @@
+# To get a shell in the running container: docker exec -it tube-spark /bin/bash
+FROM python:2
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    SPARK_VERSION="2.3.1" \
+    HADOOP_VERSION="3.1.0" \
+    SCALA_VERSION="2.12.6"
+
+ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" \
+    HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
+    SCALA_INSTALLATION_URL="https://downloads.lightbend.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz"
+
+ENV SPARK_HOME="/spark" \
+    HADOOP_HOME="/hadoop" \
+    SCALA_HOME="/scala" \
+    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64/"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    openssh-server \
+    wget \
+    git \
+    openjdk-8-jdk \
+    # dependency for cryptography
+    libffi-dev \
+    # dependency for psycopg2 - which is a dependency for the sqlalchemy postgres engine
+    libpq-dev \
+    # dependency for cryptography
+    libssl-dev \
+    python-dev \
+    python-pip \
+    python-setuptools \
+    vim \
+    net-tools
+
+RUN wget $SPARK_INSTALLATION_URL
+RUN mkdir -p $SPARK_HOME
+RUN tar -xvf spark-${SPARK_VERSION}-bin-without-hadoop.tgz -C $SPARK_HOME --strip-components 1
+RUN rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
+
+RUN wget ${HADOOP_INSTALLATION_URL}
+RUN mkdir -p $HADOOP_HOME
+RUN tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1
+RUN rm hadoop-${HADOOP_VERSION}.tar.gz
+
+RUN wget ${SCALA_INSTALLATION_URL}
+RUN mkdir -p /scala
+RUN tar -xvf scala-${SCALA_VERSION}.tgz -C ${SCALA_HOME} --strip-components 1
+RUN rm scala-${SCALA_VERSION}.tgz
+
+ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
+    HADOOP_MAPRED_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_HOME=$HADOOP_HOME \
+    HADOOP_HDFS_HOME=$HADOOP_HOME \
+    YARN_HOME=$HADOOP_HOME \
+    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
+
+ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"
+
+RUN echo 'export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo 'export HADOOP_OS_TYPE="${HADOOP_OS_TYPE:-$(uname -s)}"' >> ${HADOOP_CONF_DIR}/hadoop-env.sh && \
+    echo 'export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo 'export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo 'export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export HDFS_DATANODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export HDFS_NAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export HDFS_SECONDARYNAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export JAVA_HOME=${JAVA_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export HADOOP_HOME=${HADOOP_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
+    echo "export YARN_RESOURCEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
+    echo "export YARN_NODEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
+    echo "export SPARK_DIST_CLASSPATH=$(hadoop --config $HADOOP_HOME/etc/hadoop classpath):/hadoop/share/hadoop/tools/lib/*" >> ${SPARK_HOME}/conf/spark-env.sh && \
+    echo "spark.eventLog.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.eventLog.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.eventLog.dir hdfs://0.0.0.0:8021/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.history.fs.logDirectory file:/spark/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.ui.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.broadcast.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.io.compression.snappy.blockSize 32k" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.serializer org.apache.spark.serializer.KryoSerializer" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
+    echo "spark.app.name gen3spark" >> ${SPARK_HOME}/conf/spark-defaults.conf
+
+
+EXPOSE 22 4040 8020 8042 8088 9000 10020 19888 50010 50020 50070 50075 50090
+
+RUN mkdir /var/run/sshd
+RUN mkdir /hadoop/hdfs
+RUN mkdir /hadoop/hdfs/data
+RUN mkdir /hadoop/hdfs/data/dfs
+RUN mkdir /hadoop/hdfs/data/dfs/namenode
+RUN mkdir /hadoop/logs
+
+
+COPY . /gen3spark
+WORKDIR /gen3spark
+
+ENV TINI_VERSION v0.18.0
+ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
+RUN chmod +x /tini
+ENTRYPOINT ["/tini", "--"]
+
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/README.md b/README.md
index cc19d16..b766587 100644
--- a/README.md
+++ b/README.md
@@ -1 +1 @@
-# gen3-spark
\ No newline at end of file
+# gen3-spark
diff --git a/gen3_spark/__init__.py b/gen3_spark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gen3_spark/local_settings.example.py b/gen3_spark/local_settings.example.py
new file mode 100644
index 0000000..4b9203a
--- /dev/null
+++ b/gen3_spark/local_settings.example.py
@@ -0,0 +1,6 @@
+import os
+
+HADOOP_HOME = os.getenv("HADOOP_HOME", "/usr/local/Cellar/hadoop/3.1.0/libexec/")
+JAVA_HOME = os.getenv("JAVA_HOME", "/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home")
+HADOOP_URL = os.getenv("HADOOP_URL", "hdfs://localhost:9000")
+HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
diff --git a/gen3_spark/settings.py b/gen3_spark/settings.py
new file mode 100644
index 0000000..63d8483
--- /dev/null
+++ b/gen3_spark/settings.py
@@ -0,0 +1,16 @@
+import os
+
+try:
+    # Import everything from ``local_settings``, if it exists.
+    from gen3_spark.local_settings import *
+except ImportError:
+    # If it doesn't, load it from ``/gen3-spark/gen3_spark/local_settings.py``.
+    try:
+        import imp
+        imp.load_source('local_settings', '/gen3-spark/gen3_spark/local_settings.py')
+        print('finished importing')
+    except IOError:
+        HADOOP_HOME = os.getenv("HADOOP_HOME", "")
+        JAVA_HOME = os.getenv("JAVA_HOME", "")
+        HADOOP_URL = os.getenv("HADOOP_URL", "")
+        HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
\ No newline at end of file
diff --git a/run_config.py b/run_config.py
new file mode 100644
index 0000000..6e03d25
--- /dev/null
+++ b/run_config.py
@@ -0,0 +1,89 @@
+import xml.etree.ElementTree as et
+import gen3_spark.settings as config
+
+
+CONFIG_PATH = '{}/etc/hadoop/'.format(config.HADOOP_HOME)
+
+
+def indent(elem, level=0):
+    i = "\n" + level*" "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + " "
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+        for elem in elem:
+            indent(elem, level+1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+
+def configure_core_site():
+    core_site_path = '{}core-site.xml'.format(CONFIG_PATH)
+    tree = et.parse(core_site_path)
+    root = tree.getroot()
+    root.append(create_property('hadoop.tmp.dir', '{}/hdfs/tmp'.format(config.HADOOP_HOME)))
+    root.append(create_property('fs.default.name', config.HADOOP_URL))
+    indent(root)
+    tree.write(core_site_path)
+
+
+def configure_hdfs_site():
+    hdfs_site_path = '{}hdfs-site.xml'.format(CONFIG_PATH)
+    tree = et.parse(hdfs_site_path)
+    root = tree.getroot()
+    root.append(create_property('dfs.blocksize', '268435456'))
+    root.append(create_property('dfs.hosts', '0.0.0.0'))
+    root.append(create_property('dfs.namenode.handler.count', '100'))
+    root.append(create_property('dfs.namenode.name.dir', '/hadoop/hdfs/data/dfs/namenode'))
+    root.append(create_property('dfs.datanode.data.dir', '/hadoop/hdfs/data/dfs/datanode'))
+    root.append(create_property('dfs.namenode.http-bind-host', config.HADOOP_HOST))
+    root.append(create_property('dfs.namenode.https-bind-host', config.HADOOP_HOST))
+    root.append(create_property('dfs.client.use.datanode.hostname', 'true'))
+    root.append(create_property('dfs.datanode.use.datanode.hostname', 'true'))
+    indent(root)
+    tree.write(hdfs_site_path)
+
+
+def configure_yarn_site():
+    yarn_site_path = '{}yarn-site.xml'.format(CONFIG_PATH)
+    tree = et.parse(yarn_site_path)
+    root = tree.getroot()
+    root.append(create_property('yarn.nodemanager.aux-services', 'mapreduce_shuffle'))
+    root.append(create_property('yarn.resourcemanager.scheduler.address', '{}:8030'.format(config.HADOOP_HOST)))
+    root.append(create_property('yarn.resourcemanager.resource-tracker.address', '{}:8031'.format(config.HADOOP_HOST)))
+    root.append(create_property('yarn.resourcemanager.address', '{}:8032'.format(config.HADOOP_HOST)))
+    tree.write(yarn_site_path)
+
+
+def configure_mapred_site():
+    mapred_site_path = '{}mapred-site.xml'.format(CONFIG_PATH)
+    tree = et.parse(mapred_site_path)
+    root = tree.getroot()
+    root.append(create_property('mapreduce.framework.name', 'yarn'))
+    root.append(create_property('mapreduce.application.classpath',
+                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:'
+                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*'))
+    indent(root)
+    tree.write(mapred_site_path)
+
+
+def create_property(prop_name, prop_val):
+    prop = et.Element('property')
+    name = et.Element('name')
+    name.text = prop_name
+    value = et.Element('value')
+    value.text = prop_val
+    prop.append(name)
+    prop.append(value)
+    return prop
+
+
+if __name__ == '__main__':
+    configure_core_site()
+    configure_hdfs_site()
+    configure_yarn_site()
+    configure_mapred_site()
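
Usage sketch (illustrative, not part of the committed files): run_config.py exposes create_property() and indent(), and each configure_*_site() parses an existing *-site.xml under ${HADOOP_HOME}/etc/hadoop, appends <property> blocks, and writes the file back. The snippet below only previews the XML that configure_core_site() would append for fs.default.name; it assumes run_config is importable (for example, run from /gen3spark inside the container) and reuses the default hdfs://localhost:9000 URL from local_settings.example.py.

    import xml.etree.ElementTree as et
    from run_config import create_property, indent

    # Build the same <property> entry configure_core_site() appends, but print it
    # instead of rewriting $HADOOP_HOME/etc/hadoop/core-site.xml.
    root = et.Element('configuration')
    root.append(create_property('fs.default.name', 'hdfs://localhost:9000'))
    indent(root)
    print(et.tostring(root))

Similarly, a gen3_spark/local_settings.py modeled on local_settings.example.py is all settings.py needs in order to override the environment-variable fallbacks; the values below are hypothetical and simply mirror the paths the Dockerfile creates.

    import os

    # Hypothetical overrides for running inside the container built by this Dockerfile.
    HADOOP_HOME = os.getenv("HADOOP_HOME", "/hadoop")
    JAVA_HOME = os.getenv("JAVA_HOME", "/usr/lib/jvm/java-8-openjdk-amd64/")
    HADOOP_URL = os.getenv("HADOOP_URL", "hdfs://localhost:9000")
    HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")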