# Dockerfile.spark
# from https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build
# Debian 11 is recommended.
FROM debian:11-slim
LABEL org.opencontainers.image.source="https://github.com/cal-itp/data-infra"
# Suppress interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
# (Required) Install utilities required by Spark scripts.
RUN apt-get update && apt-get install -y procps tini libjemalloc2 \
  && rm -rf /var/lib/apt/lists/*
# Enable jemalloc2 as default memory allocator
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
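# (Optional) Sanity-check that the preloaded allocator exists in the image.
# Note this path assumes an amd64 base; on arm64 it would live under
# /usr/lib/aarch64-linux-gnu/ instead. Uncomment to fail the build early:
#RUN test -f "${LD_PRELOAD}"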
# (Optional) Add extra jars.
#ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
#ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
#RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
#COPY spark-bigquery-with-dependencies_2.12-0.22.2.jar "${SPARK_EXTRA_JARS_DIR}"
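# The connector jar above is published in the public spark-lib GCS bucket; one
# way to fetch it onto the build host before building (assuming gsutil):
#   gsutil cp gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar .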
# (Optional) Install and configure Miniconda3.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
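# (Optional) Verify the installer's integrity before executing it. <sha256> is
# a placeholder; substitute the checksum published on the Miniconda downloads
# page:
#RUN echo "<sha256>  Miniconda3-py39_4.10.3-Linux-x86_64.sh" | sha256sum -c -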
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& ${CONDA_HOME}/bin/conda config --system --set always_yes True \
&& ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
&& ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
&& ${CONDA_HOME}/bin/conda config --system --set channel_priority strict
# (Optional) Install Conda packages.
#
# The following packages are installed in the default image; it is strongly
# recommended to include all of them.
#
# Use mamba to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
&& ${CONDA_HOME}/bin/mamba install \
conda \
cython \
fastavro \
fastparquet \
gcsfs \
google-cloud-bigquery-storage \
google-cloud-bigquery[pandas] \
google-cloud-bigtable \
google-cloud-container \
google-cloud-datacatalog \
google-cloud-dataproc \
google-cloud-datastore \
google-cloud-language \
google-cloud-logging \
google-cloud-monitoring \
google-cloud-pubsub \
google-cloud-redis \
google-cloud-spanner \
google-cloud-speech \
google-cloud-storage \
google-cloud-texttospeech \
google-cloud-translate \
google-cloud-vision \
koalas \
matplotlib \
nltk \
numba \
numpy \
openblas \
orc \
pandas \
pyarrow \
pysal \
pytables \
python \
regex \
requests \
rtree \
scikit-image \
scikit-learn \
scipy \
seaborn \
sqlalchemy \
sympy \
virtualenv \
shapely==1.8.5.post1
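# (Optional) Shrink the image by clearing conda's package and tarball caches
# once everything is installed:
#RUN ${CONDA_HOME}/bin/conda clean --all --yes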
# (Optional) Add extra Python modules.
#ENV PYTHONPATH=/opt/python/packages
#RUN mkdir -p "${PYTHONPATH}"
#COPY test_util.py "${PYTHONPATH}"
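# Modules copied here become importable in the driver and executors, e.g. a
# job script could then simply `import test_util`.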
# (Optional) Install R and R libraries.
# The key ID sometimes needs to be updated; the error output will reference the current key ID to import
#RUN apt update \
# && apt install -y gnupg \
# && apt-key adv --no-tty \
# --keyserver "hkp://keyserver.ubuntu.com:80" \
# --recv-keys B8F25A8A73EACF41 \
# && echo "deb http://cloud.r-project.org/bin/linux/debian bullseye-cran40/" \
# >/etc/apt/sources.list.d/cran-r.list \
# && apt update \
# && apt install -y \
# libopenblas-base \
# libssl-dev \
# r-base \
# r-base-dev \
# r-recommended \
# r-cran-blob
#
#ENV R_HOME=/usr/lib/R
# (Required) Create the 'spark' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 spark
RUN useradd -u 1099 -g 1099 -d /home/spark -m spark
USER spark
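# Build and submit (a sketch; IMAGE, job.py, and <region> are placeholders):
#   docker build -t "${IMAGE}" -f Dockerfile.spark .
#   docker push "${IMAGE}"
#   gcloud dataproc batches submit pyspark job.py \
#     --container-image="${IMAGE}" --region=<region>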