commit 386f80f (1 parent: c61c5c1)
Showing 6 changed files with 216 additions and 1 deletion.
Dockerfile (new file)
@@ -0,0 +1,105 @@
# To inspect the running container: docker exec -it tube-spark /bin/bash
FROM python:2

ENV DEBIAN_FRONTEND=noninteractive \
    SPARK_VERSION="2.3.1" \
    HADOOP_VERSION="3.1.0" \
    SCALA_VERSION="2.12.6"

ENV SPARK_INSTALLATION_URL="http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz" \
    HADOOP_INSTALLATION_URL="http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz" \
    SCALA_INSTALLATION_URL="https://downloads.lightbend.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz"

ENV SPARK_HOME="/spark" \
    HADOOP_HOME="/hadoop" \
    SCALA_HOME="/scala" \
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64/"

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    openssh-server \
    wget \
    git \
    openjdk-8-jdk \
    # dependency for cryptography
    libffi-dev \
    # dependency for psycopg2, which is in turn a dependency of the sqlalchemy postgres engine
    libpq-dev \
    # dependency for cryptography
    libssl-dev \
    python-dev \
    python-pip \
    python-setuptools \
    vim \
    net-tools

RUN wget $SPARK_INSTALLATION_URL
RUN mkdir -p $SPARK_HOME
RUN tar -xvf spark-${SPARK_VERSION}-bin-without-hadoop.tgz -C $SPARK_HOME --strip-components 1
RUN rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz

RUN wget ${HADOOP_INSTALLATION_URL}
RUN mkdir -p $HADOOP_HOME
RUN tar -xvf hadoop-${HADOOP_VERSION}.tar.gz -C ${HADOOP_HOME} --strip-components 1
RUN rm hadoop-${HADOOP_VERSION}.tar.gz

RUN wget ${SCALA_INSTALLATION_URL}
RUN mkdir -p ${SCALA_HOME}
RUN tar -xvf scala-${SCALA_VERSION}.tgz -C ${SCALA_HOME} --strip-components 1
RUN rm scala-${SCALA_VERSION}.tgz

ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
    HADOOP_MAPRED_HOME=$HADOOP_HOME \
    HADOOP_COMMON_HOME=$HADOOP_HOME \
    HADOOP_HDFS_HOME=$HADOOP_HOME \
    YARN_HOME=$HADOOP_HOME \
    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native

ENV PATH="${PATH}:${SPARK_HOME}/bin:${SPARK_HOME}/sbin:${HADOOP_HOME}/sbin:${HADOOP_HOME}/bin:${JAVA_HOME}/bin:${SCALA_HOME}/bin"

RUN echo 'export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HADOOP_OS_TYPE="${HADOOP_OS_TYPE:-$(uname -s)}"' >> ${HADOOP_CONF_DIR}/hadoop-env.sh && \
    echo 'export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo 'export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"' >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_DATANODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_NAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HDFS_SECONDARYNAMENODE_USER=root" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export JAVA_HOME=${JAVA_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HADOOP_HOME=${HADOOP_HOME}" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib" >> $HADOOP_CONF_DIR/hadoop-env.sh && \
    echo "export YARN_RESOURCEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
    echo "export YARN_NODEMANAGER_USER=root" >> $HADOOP_CONF_DIR/yarn-env.sh && \
    echo "export SPARK_DIST_CLASSPATH=$(hadoop --config $HADOOP_HOME/etc/hadoop classpath):/hadoop/share/hadoop/tools/lib/*" >> ${SPARK_HOME}/conf/spark-env.sh && \
    echo "spark.eventLog.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.eventLog.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.eventLog.dir hdfs://0.0.0.0:8021/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.history.fs.logDirectory file:/spark/logs" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.ui.enabled true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.broadcast.compress true" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.io.compression.codec org.apache.spark.io.SnappyCompressionCodec" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.io.compression.snappy.blockSize 32k" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.serializer org.apache.spark.serializer.KryoSerializer" >> ${SPARK_HOME}/conf/spark-defaults.conf && \
    echo "spark.app.name gen3spark" >> ${SPARK_HOME}/conf/spark-defaults.conf

EXPOSE 22 4040 8020 8042 8088 9000 10020 19888 50010 50020 50070 50075 50090

RUN mkdir -p /var/run/sshd \
    /hadoop/hdfs/data/dfs/namenode \
    /hadoop/logs

COPY . /gen3spark
WORKDIR /gen3spark

ENV TINI_VERSION v0.18.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini
ENTRYPOINT ["/tini", "--"]

CMD ["/usr/sbin/sshd", "-D"]
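
Once the image is built and running (for example, docker build -t gen3-spark . followed by docker run -d -p 8088:8088 -p 50070:50070 gen3-spark; the tag and port mappings here are assumptions, while the container name tube-spark comes from the comment at the top of the Dockerfile), a minimal sketch for checking that the exposed web UIs answer:

import socket

# Ports picked from the EXPOSE list above: 8088 is the YARN ResourceManager
# UI, 50070 the HDFS NameNode UI. "localhost" assumes both were published
# with -p when the container was started.
for port in (8088, 50070):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(2)
    try:
        sock.connect(("localhost", port))
        print("port %d: open" % port)
    except socket.error:
        print("port %d: not reachable" % port)
    finally:
        sock.close()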
README
@@ -1 +1 @@
- # gen3-spark
+ # gen3-spark
Empty file.
gen3_spark/local_settings.py (new file)
@@ -0,0 +1,6 @@
import os

HADOOP_HOME = os.getenv("HADOOP_HOME", "/usr/local/Cellar/hadoop/3.1.0/libexec/")
JAVA_HOME = os.getenv("JAVA_HOME", "/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home")
HADOOP_URL = os.getenv("HADOOP_URL", "hdfs://localhost:9000")
HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
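
These defaults describe a developer machine (a Homebrew Hadoop, a macOS JDK path); anything exported in the environment, such as the HADOOP_HOME and JAVA_HOME set in the Dockerfile, takes precedence, because os.getenv only falls back to the literal default when the variable is unset. A two-line illustration:

import os

os.environ["HADOOP_URL"] = "hdfs://spark:9000"  # stands in for a value set by the container
print(os.getenv("HADOOP_URL", "hdfs://localhost:9000"))  # prints hdfs://spark:9000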
gen3_spark/settings.py (new file)
@@ -0,0 +1,16 @@
import os

try:
    # Import everything from ``local_settings``, if it exists.
    from gen3_spark.local_settings import *
except ImportError:
    # If it doesn't, load the copy shipped into the image at
    # ``/gen3-spark/gen3_spark/local_settings.py``.
    try:
        import imp
        local_settings = imp.load_source(
            'local_settings', '/gen3-spark/gen3_spark/local_settings.py')
        # ``load_source`` returns a module; bind its names here so callers
        # can read them off this module.
        HADOOP_HOME = local_settings.HADOOP_HOME
        JAVA_HOME = local_settings.JAVA_HOME
        HADOOP_URL = local_settings.HADOOP_URL
        HADOOP_HOST = local_settings.HADOOP_HOST
        print('finished importing')
    except IOError:
        # Last resort: fall back to environment variables.
        HADOOP_HOME = os.getenv("HADOOP_HOME", "")
        JAVA_HOME = os.getenv("JAVA_HOME", "")
        HADOOP_URL = os.getenv("HADOOP_URL", "")
        HADOOP_HOST = os.getenv("HADOOP_HOST", "spark")
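
A quick way to see which of the three fallbacks won is to import the module and read a value back; this sketch assumes the gen3_spark package is importable, e.g. when run from the checkout copied into the image:

import gen3_spark.settings as settings

# Comes from the packaged local_settings, from the copy loaded at
# /gen3-spark/gen3_spark/local_settings.py, or from the environment variables.
print(settings.HADOOP_URL)
print(settings.HADOOP_HOST)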
Hadoop site configuration script (new file)
@@ -0,0 +1,88 @@
import xml.etree.ElementTree as et
import gen3_spark.settings as config


CONFIG_PATH = '{}/etc/hadoop/'.format(config.HADOOP_HOME)


def indent(elem, level=0):
    # Pretty-print helper: recursively add newlines and two-space
    # indentation to an ElementTree element, in place.
    i = "\n" + level*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indent(elem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i

def configure_core_site():
    core_site_path = '{}core-site.xml'.format(CONFIG_PATH)
    tree = et.parse(core_site_path)
    root = tree.getroot()
    root.append(create_property('hadoop.tmp.dir', '{}/hdfs/tmp'.format(config.HADOOP_HOME)))
    # fs.default.name is the deprecated spelling of fs.defaultFS; Hadoop still accepts it.
    root.append(create_property('fs.default.name', config.HADOOP_URL))
    indent(root)
    tree.write(core_site_path)

def configure_hdfs_site():
    hdfs_site_path = '{}hdfs-site.xml'.format(CONFIG_PATH)
    tree = et.parse(hdfs_site_path)
    root = tree.getroot()
    root.append(create_property('dfs.blocksize', '268435456'))
    root.append(create_property('dfs.hosts', '0.0.0.0'))
    root.append(create_property('dfs.namenode.handler.count', '100'))
    root.append(create_property('dfs.namenode.name.dir', '/hadoop/hdfs/data/dfs/namenode'))
    root.append(create_property('dfs.datanode.data.dir', '/hadoop/hdfs/data/dfs/datanode'))
    root.append(create_property('dfs.namenode.http-bind-host', config.HADOOP_HOST))
    root.append(create_property('dfs.namenode.https-bind-host', config.HADOOP_HOST))
    root.append(create_property('dfs.client.use.datanode.hostname', 'true'))
    root.append(create_property('dfs.datanode.use.datanode.hostname', 'true'))
    indent(root)
    tree.write(hdfs_site_path)

def configure_yarn_site():
    yarn_site_path = '{}yarn-site.xml'.format(CONFIG_PATH)
    tree = et.parse(yarn_site_path)
    root = tree.getroot()
    root.append(create_property('yarn.nodemanager.aux-services', 'mapreduce_shuffle'))
    root.append(create_property('yarn.resourcemanager.scheduler.address', '{}:8030'.format(config.HADOOP_HOST)))
    root.append(create_property('yarn.resourcemanager.resource-tracker.address', '{}:8031'.format(config.HADOOP_HOST)))
    root.append(create_property('yarn.resourcemanager.address', '{}:8032'.format(config.HADOOP_HOST)))
    indent(root)
    tree.write(yarn_site_path)

def configure_mapred_site():
    mapred_site_path = '{}mapred-site.xml'.format(CONFIG_PATH)
    tree = et.parse(mapred_site_path)
    root = tree.getroot()
    root.append(create_property('mapreduce.framework.name', 'yarn'))
    root.append(create_property('mapreduce.application.classpath',
                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:'
                                '$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*'))
    indent(root)
    tree.write(mapred_site_path)

def create_property(prop_name, prop_val):
    # Build a <property><name>...</name><value>...</value></property> element.
    prop = et.Element('property')
    name = et.Element('name')
    name.text = prop_name
    value = et.Element('value')
    value.text = prop_val
    prop.append(name)
    prop.append(value)
    return prop

if __name__ == '__main__':
    configure_core_site()
    configure_hdfs_site()
    configure_yarn_site()
    configure_mapred_site()
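
For reference, a small sketch of what create_property builds once indent has been applied; the import path gen3_spark.spark_config is an assumption, since the diff view does not show this file's name:

import xml.etree.ElementTree as et
from gen3_spark.spark_config import create_property, indent  # module path assumed

prop = create_property('fs.defaultFS', 'hdfs://spark:9000')
indent(prop)
# prints an indented <property> element wrapping
# <name>fs.defaultFS</name> and <value>hdfs://spark:9000</value>
print(et.tostring(prop))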