#!/usr/bin/env bash
#
# setup-and-submit.sh: build an isolated Python environment, package it
# up, and submit a sample PySpark application to YARN in both client
# and cluster mode.
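# Fail fast on any error and echo each command as it runs.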
set -e
set -x
# Set $PYTHON to the Python executable you want to create
# your virtual environment with. It could just be something
# like `python3`, if that's already on your $PATH, or it could
# be a /fully/qualified/path/to/python.
test -n "$PYTHON"
# Make sure $SPARK_HOME is set and that $SPARK_HOME/bin is on your
# $PATH so that `spark-submit` runs from the correct location.
test -n "$SPARK_HOME"
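# An illustrative invocation (the paths are hypothetical; substitute
# your own interpreter and Spark installation):
#
#   PYTHON=/usr/bin/python3 SPARK_HOME=/opt/spark ./setup-and-submit.sh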
"$PYTHON" -m venv venv
source venv/bin/activate
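# Upgrade pip inside the venv, then install this project's
# dependencies from its requirements file.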
pip install -U pip
pip install -r requirements.pip
deactivate
# Package up the isolated environment that we'll ship to YARN. Zipping
# from inside venv/ keeps the archive paths relative to the venv root
# (e.g. bin/python rather than venv/bin/python), which is what the
# PYSPARK_PYTHON setting below expects.
pushd venv/
zip -rq ../venv.zip *
popd
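# Optional sanity check (a sketch, not part of the original script):
# the archive should list paths relative to the venv root, so
# bin/python should appear in the listing.
#
#   unzip -l venv.zip | grep "bin/python"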
# Here it's important that application/ be zipped in this way so that
# Python knows how to load the module inside.
zip -rq application.zip application/
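# For this to work, application/ must be a real package. An
# illustrative layout (the module name here is hypothetical):
#
#   application/__init__.py
#   application/jobs.py
#
# hello.py can then import it with e.g. `from application import jobs`.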
# We want YARN to use the Python from our virtual environment,
# which includes all our dependencies.
export PYSPARK_PYTHON="venv/bin/python"
# YARN Client Mode Example
# ------------------------
# The --archives option places our packaged-up environment on each
# YARN worker's lookup path with an alias that we define. The pattern
# is `local-file-name#aliased-file-name`. So when we set
# PYSPARK_PYTHON to `venv/bin/python`, `venv/` here references the
# aliased zip file we're sending to YARN.
spark-submit \
    --name "Sample Spark Application" \
    --master yarn \
    --deploy-mode client \
    --conf "spark.yarn.appMasterEnv.SPARK_HOME=$SPARK_HOME" \
    --conf "spark.yarn.appMasterEnv.PYSPARK_PYTHON=$PYSPARK_PYTHON" \
    --archives "venv.zip#venv" \
    --py-files "application.zip" \
    hello.py
# YARN Cluster Mode Example
# -------------------------
# Two additional tips when running in cluster mode:
# 1. Don't give your driver script (here, hello.py) the same name as
#    your application package directory. A name clash confuses Python
#    when it tries to import your module (e.g. `import application`).
# 2. Since your driver runs on the cluster, replicate any environment
#    variables it needs using `--conf "spark.yarn.appMasterEnv..."`
#    and ship any local files it depends on using `--files` (see the
#    illustrative flags below).
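#
# For example (MY_VAR and app.conf are hypothetical, not part of this
# repo):
#
#   --conf "spark.yarn.appMasterEnv.MY_VAR=$MY_VAR" \
#   --files "app.conf" \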
spark-submit \
    --name "Sample Spark Application" \
    --master yarn \
    --deploy-mode cluster \
    --conf "spark.yarn.appMasterEnv.SPARK_HOME=$SPARK_HOME" \
    --conf "spark.yarn.appMasterEnv.PYSPARK_PYTHON=$PYSPARK_PYTHON" \
    --archives "venv.zip#venv" \
    --py-files "application.zip" \
    hello.py
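# After a cluster-mode run, the driver's output lives on the cluster
# rather than in your terminal. One way to retrieve it (the application
# id below is hypothetical):
#
#   yarn logs -applicationId application_1234567890123_0001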