Vagrant Hadoop Single-node Cluster

Start Vagrant VM:

vagrant up
vagrant ssh
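
The web UIs referenced below (NameNode, ResourceManager, Livy) assume the Vagrantfile gives the VM the private-network address 192.0.2.20; once inside the VM you can double-check which address it actually received:

ip addr show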

Install Java:

sudo apt-get install openjdk-8-jdk -y

Test Java:

java -version

Check Java Home:

dirname $(dirname $(readlink -f $(which javac)))

Setup SSH:

sudo apt-get install ssh -y
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys

Test SSH:

ssh localhost

Set Python3:

sudo ln -s /usr/bin/python3 /usr/bin/python
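
If you prefer not to touch /usr/bin, Spark can alternatively be pointed at Python 3 through the PYSPARK_PYTHON variable (add it next to the exports in the next step):

export PYSPARK_PYTHON=python3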

Install Hadoop and Spark:

curl -O https://downloads.apache.org/hadoop/common/hadoop-3.2.1/hadoop-3.2.1.tar.gz
curl -O https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-without-hadoop.tgz

tar -xvzf hadoop-3.2.1.tar.gz
tar -xvzf spark-2.4.5-bin-without-hadoop.tgz

mv hadoop-3.2.1 hadoop
mv spark-2.4.5-bin-without-hadoop spark

mkdir hadoop/logs
mkdir dfs
mkdir dfs/data
mkdir dfs/name


rm hadoop-3.2.1.tar.gz
rm spark-2.4.5-bin-without-hadoop.tgz

Setup environment:

nano .bashrc

Paste this:

export JAVA_HOME=`dirname $(dirname $(readlink -f $(which javac)))`
export HADOOP_HOME=/home/vagrant/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export SPARK_HOME=/home/vagrant/spark
export SPARK_LOCAL_IP=192.0.2.20
export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${SPARK_HOME}/bin:${SPARK_HOME}/sbin

Refresh environment:

source .bashrc
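
The Hadoop CLI should now resolve from the updated PATH:

hadoop version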

Setup Hadoop:

nano ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh

Update this:

export JAVA_HOME=`dirname $(dirname $(readlink -f $(which javac)))`
export HADOOP_HOME=/home/vagrant/hadoop
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

nano ${HADOOP_HOME}/etc/hadoop/core-site.xml

Paste this:

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>

nano ${HADOOP_HOME}/etc/hadoop/hdfs-site.xml

Paste this:

<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/home/vagrant/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/home/vagrant/dfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

Start Hadoop:

hdfs namenode -format
${HADOOP_HOME}/sbin/start-dfs.sh
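
If the daemons started cleanly, jps (shipped with the JDK) should list NameNode, DataNode and SecondaryNameNode:

jps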

Browse Namenode:

http://192.0.2.20:9870/

Test Hadoop:

hdfs dfs -mkdir /user
hdfs dfs -mkdir /user/vagrant
hdfs dfs -mkdir input
hdfs dfs -put hadoop/etc/hadoop/*.xml input
hadoop jar hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.1.jar grep input output 'dfs[a-z.]+'
hdfs dfs -get output output
cat output/*
hdfs dfs -cat output/*
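
MapReduce refuses to overwrite an existing output directory, so clear both the HDFS and local copies before re-running the example:

hdfs dfs -rm -r output
rm -r output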

Stop Hadoop:

${HADOOP_HOME}/sbin/stop-dfs.sh

Setup Spark:

cp ${SPARK_HOME}/conf/spark-env.sh.template ${SPARK_HOME}/conf/spark-env.sh
nano ${SPARK_HOME}/conf/spark-env.sh

Paste this:

export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export YARN_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export SPARK_DIST_CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath)
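
With SPARK_DIST_CLASSPATH in place, a quick local run of the bundled Pi example confirms that this "without-hadoop" Spark build can see the Hadoop jars before YARN enters the picture:

spark-submit --master local ${SPARK_HOME}/examples/src/main/python/pi.py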

Setup Yarn:

nano ${HADOOP_HOME}/etc/hadoop/mapred-site.xml

Paste this:

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
    </property>
</configuration>

nano ${HADOOP_HOME}/etc/hadoop/yarn-site.xml

Paste this:

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>

Start Yarn:

${HADOOP_HOME}/sbin/start-yarn.sh
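
jps should now additionally show ResourceManager and NodeManager:

jps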

Browse Resource Manager:

http://192.0.2.20:8088/

Submit Job:

spark-submit --master yarn --deploy-mode cluster ${SPARK_HOME}/examples/src/main/python/pi.py
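
In cluster mode the driver runs inside YARN, so spark-submit only reports the final status; you can check the result afterwards with the application id it prints (shown here as a placeholder):

yarn application -status <application_id>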

Stop Yarn:

${HADOOP_HOME}/sbin/stop-yarn.sh

Install Livy:

curl -O https://downloads.apache.org/incubator/livy/0.7.0-incubating/apache-livy-0.7.0-incubating-bin.zip

unzip apache-livy-0.7.0-incubating-bin.zip

mv apache-livy-0.7.0-incubating-bin livy

rm apache-livy-0.7.0-incubating-bin.zip
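
Livy picks up SPARK_HOME and HADOOP_CONF_DIR from the environment configured in .bashrc above; optionally, the default master and deploy mode for submitted batches can be pinned in livy.conf (copied from the bundled template):

cp livy/conf/livy.conf.template livy/conf/livy.conf
nano livy/conf/livy.conf

Paste this:

livy.spark.master = yarn
livy.spark.deploy-mode = cluster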

Start Livy:

livy/bin/livy-server start

Browse Sessions:

http://192.0.2.20:8998

Prepare Job:

hdfs dfs -mkdir scripts
hdfs dfs -put spark/examples/src/main/python/pi.py scripts

Run Job:

curl --location --request POST 'http://192.0.2.20:8998/batches?doAs=vagrant' \
--header 'Content-Type: application/json' \
--data-raw '{
    "name": "Py-Pi",
    "file": "/user/vagrant/scripts/pi.py",
    "executorMemory": "1g",
    "executorCores": 1,
    "numExecutors": 1
}'
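
Livy replies with a JSON description of the batch, including its id. Substituting that id (shown here as 0), the state and driver log can be polled over the same API:

curl http://192.0.2.20:8998/batches/0/state
curl http://192.0.2.20:8998/batches/0/log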
