Start Vagrant VM:
vagrant up
vagrant ssh
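These steps assume the Vagrantfile gives the guest a private network address of 192.0.2.20, which is what SPARK_LOCAL_IP, the web UIs and the Livy calls below rely on. A quick reachability check, run from the host rather than inside the VM:
ping -c 1 192.0.2.20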
Install Java:
sudo apt-get update
sudo apt-get install openjdk-8-jdk -y
Test Java:
java -version
Check Java Home:
dirname $(dirname $(readlink -f $(which javac)))
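With the OpenJDK 8 package above, this typically resolves to the following path (expected output, assuming the default Ubuntu amd64 layout):
/usr/lib/jvm/java-8-openjdk-amd64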
Setup SSH:
sudo apt-get install ssh -y
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
Test SSH:
ssh localhost
Set Python3:
sudo ln -s /usr/bin/python3 /usr/bin/python
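Optional check that the symlink now points at Python 3:
python --version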
Install Hadoop and Spark:
curl -O https://downloads.apache.org/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
curl -O https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-without-hadoop.tgz
tar -xvzf hadoop-3.2.1.tar.gz
tar -xvzf spark-2.4.5-bin-without-hadoop.tgz
mv hadoop-3.2.1 hadoop
mv spark-2.4.5-bin-without-hadoop spark
mkdir hadoop/logs
mkdir dfs
mkdir dfs/data
mkdir dfs/name
rm hadoop-3.2.1.tar.gz
rm spark-2.4.5-bin-without-hadoop.tgz
Setup environment:
nano .bashrc
Paste this:
export JAVA_HOME=`dirname $(dirname $(readlink -f $(which javac)))`
export HADOOP_HOME=/home/vagrant/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export SPARK_HOME=/home/vagrant/spark
export SPARK_LOCAL_IP=192.0.2.20
export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${SPARK_HOME}/bin:${SPARK_HOME}/sbin
Refresh environment:
source .bashrc
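A couple of quick checks that the environment is wired up (the Spark commands are not usable yet, since the without-hadoop build needs SPARK_DIST_CLASSPATH, which is set later in spark-env.sh):
echo ${JAVA_HOME}
hadoop version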
Setup Hadoop:
nano ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh
Update this:
export JAVA_HOME=`dirname $(dirname $(readlink -f $(which javac)))`
export HADOOP_HOME=/home/vagrant/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
nano ${HADOOP_HOME}/etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
</configuration>
nano ${HADOOP_HOME}/etc/hadoop/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/home/vagrant/dfs/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/home/vagrant/dfs/data</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
</configuration>
Start Hadoop:
hdfs namenode -format
${HADOOP_HOME}/sbin/start-dfs.sh
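To confirm the HDFS daemons came up, jps (shipped with the JDK) should list NameNode, DataNode and SecondaryNameNode:
jps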
Browse Namenode:
http://192.0.2.20:9870/
Test Hadoop:
hdfs dfs -mkdir /user
hdfs dfs -mkdir /user/vagrant
hdfs dfs -mkdir input
hdfs dfs -put ${HADOOP_HOME}/etc/hadoop/*.xml input
hadoop jar ${HADOOP_HOME}/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.1.jar grep input output 'dfs[a-z.]+'
hdfs dfs -get output output
cat output/*
Or view the output directly on HDFS:
hdfs dfs -cat output/*
Stop Hadoop:
${HADOOP_HOME}/sbin/stop-dfs.sh
Setup Spark:
cp ${SPARK_HOME}/conf/spark-env.sh.template ${SPARK_HOME}/conf/spark-env.sh
nano ${SPARK_HOME}/conf/spark-env.sh
Paste this:
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export YARN_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)
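Before wiring up Yarn, an optional local-mode run can confirm that Spark picks up the Hadoop classpath from spark-env.sh (the trailing 10 is the number of partitions pi.py accepts):
spark-submit --master local[2] ${SPARK_HOME}/examples/src/main/python/pi.py 10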
Setup Yarn:
nano ${HADOOP_HOME}/etc/hadoop/mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <property>
    <name>mapreduce.application.classpath</name>
    <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
  </property>
</configuration>
nano ${HADOOP_HOME}/etc/hadoop/yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>
Start HDFS and Yarn (HDFS was stopped above and must be running again, since Spark on Yarn stages its files in HDFS):
${HADOOP_HOME}/sbin/start-dfs.sh
${HADOOP_HOME}/sbin/start-yarn.sh
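jps should now also list ResourceManager and NodeManager alongside the HDFS daemons:
jps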
Browse Resource Manager:
http://192.0.2.20:8088/
Submit Job:
spark-submit --master yarn --deploy-mode cluster ${SPARK_HOME}/examples/src/main/python/pi.py
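In cluster deploy mode the driver runs inside Yarn, so the "Pi is roughly ..." line ends up in the application's container logs rather than in the spark-submit output. One way to find it, assuming the NodeManager keeps container logs under the default ${HADOOP_HOME}/logs/userlogs:
yarn application -list -appStates FINISHED
grep -r "Pi is roughly" ${HADOOP_HOME}/logs/userlogs/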
Stop Yarn:
${HADOOP_HOME}/sbin/stop-yarn.sh
Install Livy:
curl -O https://downloads.apache.org/incubator/livy/0.7.0-incubating/apache-livy-0.7.0-incubating-bin.zip
sudo apt-get install unzip -y
unzip apache-livy-0.7.0-incubating-bin.zip
mv apache-livy-0.7.0-incubating-bin livy
rm apache-livy-0.7.0-incubating-bin.zip
Start Livy:
livy/bin/livy-server start
Browse Sessions:
http://192.0.2.20:8998/
Prepare Job:
hdfs dfs -mkdir scripts
hdfs dfs -put spark/examples/src/main/python/pi.py scripts
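A quick check that the script landed in HDFS:
hdfs dfs -ls scripts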
Run Job:
curl --location --request POST 'http://192.0.2.20:8998/batches?doAs=vagrant' \
--header 'Content-Type: application/json' \
--data-raw '{
  "name": "Py-Pi",
  "file": "/user/vagrant/scripts/pi.py",
  "executorMemory": "1g",
  "executorCores": 1,
  "numExecutors": 1
}'
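The POST returns a JSON body containing the batch id; the batch state and logs can then be followed over the same REST API (the id 0 below is just an illustration, use the id from the response):
curl http://192.0.2.20:8998/batches/0
curl http://192.0.2.20:8998/batches/0/log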