From 82521c1ad1e895a987ea85fba3db8c30c02311b7 Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Mon, 16 Dec 2024 10:07:54 -0500 Subject: [PATCH 01/14] feat: add pyspark --- docker-bits/6_jupyterlab.Dockerfile | 1 + output/jupyterlab-cpu/Dockerfile | 1 + output/jupyterlab-pytorch/Dockerfile | 1 + output/jupyterlab-tensorflow/Dockerfile | 1 + output/sas/Dockerfile | 1 + 5 files changed, 5 insertions(+) diff --git a/docker-bits/6_jupyterlab.Dockerfile b/docker-bits/6_jupyterlab.Dockerfile index 663f4bd07..27318a033 100644 --- a/docker-bits/6_jupyterlab.Dockerfile +++ b/docker-bits/6_jupyterlab.Dockerfile @@ -65,6 +65,7 @@ RUN pip install \ 'plotly' \ 'ipywidgets' \ 'markupsafe' \ + 'pyspark' \ 'ipympl' \ 'pexpect==4.9.0' \ 'jupyter-resource-usage' \ diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 62449118c..928fb4dfa 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -342,6 +342,7 @@ RUN pip install \ 'plotly' \ 'ipywidgets' \ 'markupsafe' \ + 'pyspark' \ 'ipympl' \ 'pexpect==4.9.0' \ 'jupyter-resource-usage' \ diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index b307e8ec9..40a6f1214 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -364,6 +364,7 @@ RUN pip install \ 'plotly' \ 'ipywidgets' \ 'markupsafe' \ + 'pyspark' \ 'ipympl' \ 'pexpect==4.9.0' \ 'jupyter-resource-usage' \ diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index a0610abea..a6af673a0 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -471,6 +471,7 @@ RUN pip install \ 'plotly' \ 'ipywidgets' \ 'markupsafe' \ + 'pyspark' \ 'ipympl' \ 'pexpect==4.9.0' \ 'jupyter-resource-usage' \ diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index d614f353c..1625f200c 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -276,6 +276,7 @@ RUN pip install \ 'plotly' \ 'ipywidgets' \ 'markupsafe' \ + 'pyspark' \ 'ipympl' \ 'pexpect==4.9.0' \ 'jupyter-resource-usage' \ From 88fe3bf6db83a4ca91b3ca734ec5492f34c62d4b Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Mon, 16 Dec 2024 10:17:48 -0500 Subject: [PATCH 02/14] feat: add sparklyr --- docker-bits/6_rstudio.Dockerfile | 1 + output/jupyterlab-cpu/Dockerfile | 1 + output/jupyterlab-pytorch/Dockerfile | 1 + output/jupyterlab-tensorflow/Dockerfile | 1 + output/rstudio/Dockerfile | 1 + output/sas/Dockerfile | 1 + 6 files changed, 6 insertions(+) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 7697cab44..f7c393fc2 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -20,6 +20,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 928fb4dfa..41762cb53 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -251,6 +251,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 40a6f1214..5c869f6af 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -273,6 +273,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 
'r-odbc' \ 'r-renv' \ 'r-rodbc' \ diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index a6af673a0..21dd549fc 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -380,6 +380,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 7e6c1760e..e125b1caa 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -251,6 +251,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 1625f200c..55caa4c33 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -411,6 +411,7 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ + 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ From 70143476fe88977f4f2067529ccfac34c5d8dbbd Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Mon, 16 Dec 2024 15:41:52 -0500 Subject: [PATCH 03/14] feat: add spark for local r dev/debugging --- output/docker-stacks-datascience-notebook/start-custom.sh | 5 +++++ output/jupyterlab-cpu/start-custom.sh | 5 +++++ output/jupyterlab-pytorch/start-custom.sh | 5 +++++ output/jupyterlab-tensorflow/start-custom.sh | 5 +++++ output/remote-desktop/start-custom.sh | 5 +++++ output/rstudio/start-custom.sh | 5 +++++ output/sas/start-custom.sh | 5 +++++ resources/common/start-custom.sh | 5 +++++ 8 files changed, 40 insertions(+) diff --git a/output/docker-stacks-datascience-notebook/start-custom.sh b/output/docker-stacks-datascience-notebook/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/docker-stacks-datascience-notebook/start-custom.sh +++ b/output/docker-stacks-datascience-notebook/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-cpu/start-custom.sh b/output/jupyterlab-cpu/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/jupyterlab-cpu/start-custom.sh +++ b/output/jupyterlab-cpu/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-pytorch/start-custom.sh b/output/jupyterlab-pytorch/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/jupyterlab-pytorch/start-custom.sh +++ b/output/jupyterlab-pytorch/start-custom.sh @@ -50,6 +50,11 @@ if [ ! 
-e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-tensorflow/start-custom.sh b/output/jupyterlab-tensorflow/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/jupyterlab-tensorflow/start-custom.sh +++ b/output/jupyterlab-tensorflow/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/remote-desktop/start-custom.sh b/output/remote-desktop/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/remote-desktop/start-custom.sh +++ b/output/remote-desktop/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/rstudio/start-custom.sh b/output/rstudio/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/rstudio/start-custom.sh +++ b/output/rstudio/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/sas/start-custom.sh b/output/sas/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/output/sas/start-custom.sh +++ b/output/sas/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/resources/common/start-custom.sh b/resources/common/start-custom.sh index 7e409509d..0346ec64a 100755 --- a/resources/common/start-custom.sh +++ b/resources/common/start-custom.sh @@ -50,6 +50,11 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi +# Install sparklyr +Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ + -e "library(sparklyr)" \ + -e "spark_install()" + # Configure the shell! If not already configured. 
if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then From 74f074fb8583b1083086d6f4ef1520231c1ca296 Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Mon, 16 Dec 2024 15:54:39 -0500 Subject: [PATCH 04/14] feat: add spark for local r dev/debugging --- docker-bits/6_rstudio.Dockerfile | 3 +++ output/docker-stacks-datascience-notebook/start-custom.sh | 5 ----- output/jupyterlab-cpu/Dockerfile | 3 +++ output/jupyterlab-cpu/start-custom.sh | 5 ----- output/jupyterlab-pytorch/Dockerfile | 3 +++ output/jupyterlab-pytorch/start-custom.sh | 5 ----- output/jupyterlab-tensorflow/Dockerfile | 3 +++ output/jupyterlab-tensorflow/start-custom.sh | 5 ----- output/remote-desktop/start-custom.sh | 5 ----- output/rstudio/Dockerfile | 3 +++ output/rstudio/start-custom.sh | 5 ----- output/sas/Dockerfile | 3 +++ output/sas/start-custom.sh | 5 ----- resources/common/start-custom.sh | 5 ----- 14 files changed, 18 insertions(+), 40 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index f7c393fc2..30fb93558 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -37,6 +37,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/docker-stacks-datascience-notebook/start-custom.sh b/output/docker-stacks-datascience-notebook/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/docker-stacks-datascience-notebook/start-custom.sh +++ b/output/docker-stacks-datascience-notebook/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 41762cb53..66a74073b 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -268,6 +268,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-cpu/start-custom.sh b/output/jupyterlab-cpu/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/jupyterlab-cpu/start-custom.sh +++ b/output/jupyterlab-cpu/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! 
-f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 5c869f6af..086ac35f0 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -290,6 +290,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-pytorch/start-custom.sh b/output/jupyterlab-pytorch/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/jupyterlab-pytorch/start-custom.sh +++ b/output/jupyterlab-pytorch/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 21dd549fc..0e67b4aa9 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -397,6 +397,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-tensorflow/start-custom.sh b/output/jupyterlab-tensorflow/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/jupyterlab-tensorflow/start-custom.sh +++ b/output/jupyterlab-tensorflow/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/remote-desktop/start-custom.sh b/output/remote-desktop/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/remote-desktop/start-custom.sh +++ b/output/remote-desktop/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! 
-f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index e125b1caa..9a593d2f6 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -268,6 +268,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/rstudio/start-custom.sh b/output/rstudio/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/rstudio/start-custom.sh +++ b/output/rstudio/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 55caa4c33..d16f4698b 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -428,6 +428,9 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +# Install sparklyr +RUN Rscript -e "library(sparklyr); spark_install()" + # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/sas/start-custom.sh b/output/sas/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/output/sas/start-custom.sh +++ b/output/sas/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then diff --git a/resources/common/start-custom.sh b/resources/common/start-custom.sh index 0346ec64a..7e409509d 100755 --- a/resources/common/start-custom.sh +++ b/resources/common/start-custom.sh @@ -50,11 +50,6 @@ if [ ! -e /home/$NB_USER/.Rprofile ]; then cat /tmp/.Rprofile >> /home/$NB_USER/.Rprofile && rm -rf /tmp/.Rprofile fi -# Install sparklyr -Rscript -e "if (!requireNamespace('sparklyr', quietly = TRUE)) install.packages('sparklyr')" \ - -e "library(sparklyr)" \ - -e "spark_install()" - # Configure the shell! If not already configured. if [ ! -f /home/$NB_USER/.zsh-installed ]; then if [ -f /tmp/oh-my-zsh-install.sh ]; then From a2831e98abe454187839228e6435ac2f19b14952 Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Tue, 17 Dec 2024 10:46:45 -0500 Subject: [PATCH 05/14] feat: add spark for local r dev/debugging Add libxml2-dev and libcurl4-openssl-dev as spark R dependencies. 
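A minimal smoke test for the packages these commits add, assuming a shell inside a container built from one of the generated images (the local[1]/local master and the availability of Java on PATH are assumptions, not part of the patches):

  # PySpark (pip package from PATCH 01): start a local session and print its version
  python3 -c "from pyspark.sql import SparkSession; \
    spark = SparkSession.builder.master('local[1]').getOrCreate(); \
    print(spark.version); spark.stop()"

  # sparklyr (r-sparklyr from PATCH 02, spark_install from PATCH 03/04): connect to a local Spark and check the version
  Rscript -e 'library(sparklyr); sc <- spark_connect(master = "local"); print(spark_version(sc)); spark_disconnect(sc)'
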
--- docker-bits/6_rstudio.Dockerfile | 6 +++++- output/jupyterlab-cpu/Dockerfile | 6 +++++- output/jupyterlab-pytorch/Dockerfile | 6 +++++- output/jupyterlab-tensorflow/Dockerfile | 6 +++++- output/rstudio/Dockerfile | 6 +++++- output/sas/Dockerfile | 6 +++++- 6 files changed, 30 insertions(+), 6 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 30fb93558..f05016cee 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -38,7 +38,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 66a74073b..782fa6a50 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -269,7 +269,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 086ac35f0..92cbaadbe 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -291,7 +291,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 0e67b4aa9..9e7e2c14c 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -398,7 +398,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 9a593d2f6..4700d5a98 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -269,7 +269,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions 
"${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index d16f4698b..6ebc8df39 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -429,7 +429,11 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -RUN Rscript -e "library(sparklyr); spark_install()" +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" From bcf9cdd38f4e9818623a77dce3a45d3f5af9e1d1 Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Tue, 17 Dec 2024 11:19:11 -0500 Subject: [PATCH 06/14] feat: add spark for local r dev/debugging Add libxml2-dev and libcurl4-openssl-dev as spark R dependencies. --- docker-bits/6_jupyterlab.Dockerfile | 4 ++++ docker-bits/6_rstudio.Dockerfile | 4 ++-- output/jupyterlab-cpu/Dockerfile | 6 +++++- output/jupyterlab-pytorch/Dockerfile | 6 +++++- output/jupyterlab-tensorflow/Dockerfile | 6 +++++- output/rstudio/Dockerfile | 2 +- output/sas/Dockerfile | 6 +++++- 7 files changed, 27 insertions(+), 7 deletions(-) diff --git a/docker-bits/6_jupyterlab.Dockerfile b/docker-bits/6_jupyterlab.Dockerfile index 27318a033..ede929df8 100644 --- a/docker-bits/6_jupyterlab.Dockerfile +++ b/docker-bits/6_jupyterlab.Dockerfile @@ -19,6 +19,10 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index f05016cee..b8ff9458c 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -41,9 +41,9 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" -ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git +ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git \ No newline at end of file diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 782fa6a50..cae793d7f 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -272,7 +272,7 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" 
RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer @@ -304,6 +304,10 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 92cbaadbe..8c98f0f9d 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -294,7 +294,7 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer @@ -326,6 +326,10 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 9e7e2c14c..577a4e252 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -401,7 +401,7 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer @@ -433,6 +433,10 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 4700d5a98..6d443e002 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -272,7 
+272,7 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 6ebc8df39..c65d8b259 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -230,6 +230,10 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ @@ -432,7 +436,7 @@ RUN python3 -m pip install \ ENV R_LIBS_USER="${SPARK_HOME}/R/lib" RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" # If using the docker bit in other Dockerfiles, this must get written over in a later layer From 08de9908b2cfe8991ce851724a17d32439cfe86d Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Tue, 17 Dec 2024 13:26:55 -0500 Subject: [PATCH 07/14] feat: add spark for local r dev/debugging Add libxml2-dev and libcurl4-openssl-dev as spark R dependencies. 
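For reference, the SPARK_HOME-related settings that these commits move between layers amount to the following shell environment at runtime (values copied verbatim from the diffs; the spark-submit check is only an illustrative sketch and assumes a Spark distribution has actually been unpacked at /usr/local/spark):

  export SPARK_HOME=/usr/local/spark
  export PATH="${PATH}:${SPARK_HOME}/bin"
  export SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
  spark-submit --version   # resolves via ${SPARK_HOME}/bin
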
--- docker-bits/6_jupyterlab.Dockerfile | 4 ---- docker-bits/6_rstudio.Dockerfile | 2 -- output/jupyterlab-cpu/Dockerfile | 6 ------ output/jupyterlab-pytorch/Dockerfile | 6 ------ output/jupyterlab-tensorflow/Dockerfile | 6 ------ output/rstudio/Dockerfile | 2 -- output/sas/Dockerfile | 6 ------ 7 files changed, 32 deletions(-) diff --git a/docker-bits/6_jupyterlab.Dockerfile b/docker-bits/6_jupyterlab.Dockerfile index ede929df8..27318a033 100644 --- a/docker-bits/6_jupyterlab.Dockerfile +++ b/docker-bits/6_jupyterlab.Dockerfile @@ -19,10 +19,6 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index b8ff9458c..980d29239 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -38,8 +38,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index cae793d7f..a23e264ef 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -269,8 +269,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" @@ -304,10 +302,6 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 8c98f0f9d..40c813401 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -291,8 +291,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" @@ -326,10 +324,6 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M 
--driver-java-options=-Dlog4j.logLevel=info" - RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 577a4e252..04234a132 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -398,8 +398,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" @@ -433,10 +431,6 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 6d443e002..b9b02e58a 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -269,8 +269,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index c65d8b259..e3f1d97d7 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -230,10 +230,6 @@ ENV CS_TEMP_HOME=/etc/share/code-server ENV CS_DEFAULT_HOME=$HOME/.local/share/code-server ENV SERVICE_URL=https://extensions.coder.com/api -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - RUN wget -q "${VSCODE_URL}" -O ./vscode.deb \ && echo "${VSCODE_SHA} ./vscode.deb" | sha256sum -c - \ && wget -q https://github.com/microsoft/vscode-cpptools/releases/download/v1.20.5/cpptools-linux.vsix \ @@ -433,8 +429,6 @@ RUN python3 -m pip install \ fix-permissions /home/$NB_USER # Install sparklyr -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ Rscript -e "library(sparklyr); spark_install()" From 78acae4d82065bb17976903a0abf4450568e0f2a Mon Sep 17 00:00:00 2001 From: Bryan Paget <8212170+bryanpaget@users.noreply.github.com> Date: Wed, 18 Dec 2024 15:51:08 +0000 Subject: [PATCH 08/14] feat: update spark version --- docker-bits/6_rstudio.Dockerfile | 6 ++++-- output/jupyterlab-cpu/Dockerfile | 4 +++- output/jupyterlab-pytorch/Dockerfile | 4 +++- output/jupyterlab-tensorflow/Dockerfile | 4 +++- output/rstudio/Dockerfile | 4 +++- output/sas/Dockerfile | 4 +++- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 980d29239..4cb9af1ef 100644 --- 
a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -37,11 +37,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" -ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git \ No newline at end of file +ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index a23e264ef..4dedc8854 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -268,10 +268,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 40c813401..1f28f707a 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -290,10 +290,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 04234a132..a8e086e8d 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -397,10 +397,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index b9b02e58a..63df867c1 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -268,10 +268,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = 
${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index e3f1d97d7..5d593d74e 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -428,10 +428,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER +ENV SPARK_VERSION=3.5 + # Install sparklyr RUN apt-get update && \ apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install()" + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" From a187c623c9fedfccf9a8eff56d5ed328cb3ca10e Mon Sep 17 00:00:00 2001 From: Bryan Paget <8212170+bryanpaget@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:38:06 +0000 Subject: [PATCH 09/14] feat: implement spark install similar to upstream --- docker-bits/6_rstudio.Dockerfile | 45 ++++-- .../setup_spark.py | 131 ++++++++++++++++++ output/jupyterlab-cpu/Dockerfile | 47 +++++-- output/jupyterlab-cpu/setup_spark.py | 131 ++++++++++++++++++ output/jupyterlab-pytorch/Dockerfile | 47 +++++-- output/jupyterlab-pytorch/setup_spark.py | 131 ++++++++++++++++++ output/jupyterlab-tensorflow/Dockerfile | 47 +++++-- output/jupyterlab-tensorflow/setup_spark.py | 131 ++++++++++++++++++ output/remote-desktop/setup_spark.py | 131 ++++++++++++++++++ output/rstudio/Dockerfile | 47 +++++-- output/rstudio/setup_spark.py | 131 ++++++++++++++++++ output/sas/Dockerfile | 47 +++++-- output/sas/setup_spark.py | 131 ++++++++++++++++++ resources/common/setup_spark.py | 131 ++++++++++++++++++ 14 files changed, 1269 insertions(+), 59 deletions(-) create mode 100644 output/docker-stacks-datascience-notebook/setup_spark.py create mode 100644 output/jupyterlab-cpu/setup_spark.py create mode 100644 output/jupyterlab-pytorch/setup_spark.py create mode 100644 output/jupyterlab-tensorflow/setup_spark.py create mode 100644 output/remote-desktop/setup_spark.py create mode 100644 output/rstudio/setup_spark.py create mode 100644 output/sas/setup_spark.py create mode 100644 resources/common/setup_spark.py diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 4cb9af1ef..cbb8035e7 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -32,17 +32,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER + +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV 
PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ -ENV SPARK_VERSION=3.5 +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/docker-stacks-datascience-notebook/setup_spark.py b/output/docker-stacks-datascience-notebook/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/docker-stacks-datascience-notebook/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + 
"root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 4dedc8854..8cee6d0fc 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -263,17 +263,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER - -ENV SPARK_VERSION=3.5 + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ + +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" + +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = 
${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-cpu/setup_spark.py b/output/jupyterlab-cpu/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/jupyterlab-cpu/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + 
arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 1f28f707a..2182ce106 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -285,17 +285,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER - -ENV SPARK_VERSION=3.5 + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ + +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" + +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-pytorch/setup_spark.py b/output/jupyterlab-pytorch/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/jupyterlab-pytorch/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index a8e086e8d..233e2e2ae 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -392,17 +392,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER - -ENV SPARK_VERSION=3.5 + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ + +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" + +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-tensorflow/setup_spark.py b/output/jupyterlab-tensorflow/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/jupyterlab-tensorflow/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/remote-desktop/setup_spark.py b/output/remote-desktop/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/remote-desktop/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + 
arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 63df867c1..80c95a5e7 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -263,17 +263,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER - -ENV SPARK_VERSION=3.5 + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ + +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" + +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/rstudio/setup_spark.py b/output/rstudio/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/rstudio/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 5d593d74e..eb90aabf2 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -423,17 +423,44 @@ RUN mamba install --quiet --yes \ fix-permissions /home/$NB_USER RUN python3 -m pip install \ - 'jupyter-rsession-proxy==2.2.0' \ - 'jupyter-server-proxy==4.2.0' && \ - fix-permissions $CONDA_DIR && \ - fix-permissions /home/$NB_USER - -ENV SPARK_VERSION=3.5 + 'jupyter-rsession-proxy==2.2.0' \ + 'jupyter-server-proxy==4.2.0' && \ + fix-permissions $CONDA_DIR && \ + fix-permissions /home/$NB_USER -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# ENV SPARK_VERSION=3.5 + +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + +# If spark_version is not set, latest Spark will be installed +ARG spark_version +ARG hadoop_version="3" +# If scala_version is not set, Spark without Scala will be installed +ARG scala_version +# URL to use for Spark downloads +# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions +# But it seems to be slower, that's why we use the recommended site for download +ARG spark_download_url="https://dlcdn.apache.org/spark/" + +ENV SPARK_HOME=/usr/local/spark +ENV PATH="${PATH}:${SPARK_HOME}/bin" +ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" + +COPY setup_spark.py /opt/setup-scripts/ + +# Setup Spark +RUN /opt/setup-scripts/setup_spark.py \ + --spark-version="${spark_version}" \ + --hadoop-version="${hadoop_version}" \ + --scala-version="${scala_version}" \ + --spark-download-url="${spark_download_url}" + +# # Install sparklyr +# RUN apt-get update && \ +# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ +# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/sas/setup_spark.py b/output/sas/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/output/sas/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. 
+ +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + 
configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) diff --git a/resources/common/setup_spark.py b/resources/common/setup_spark.py new file mode 100644 index 000000000..c5b76433f --- /dev/null +++ b/resources/common/setup_spark.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +# Requirements: +# - Run as the root user +# - Required env variable: SPARK_HOME + +import argparse +import logging +import os +import subprocess +from pathlib import Path + +import requests +from bs4 import BeautifulSoup + +LOGGER = logging.getLogger(__name__) + + +def get_all_refs(url: str) -> list[str]: + """ + Get all the references for a given webpage + """ + resp = requests.get(url) + soup = BeautifulSoup(resp.text, "html.parser") + return [a["href"] for a in soup.find_all("a", href=True)] + + +def get_latest_spark_version() -> str: + """ + Returns the last version of Spark using spark archive + """ + LOGGER.info("Downloading Spark versions information") + all_refs = get_all_refs("https://archive.apache.org/dist/spark/") + versions = [ + ref.removeprefix("spark-").removesuffix("/") + for ref in all_refs + if ref.startswith("spark-") and "incubating" not in ref + ] + + # Compare versions semantically + def version_array(ver: str) -> tuple[int, int, int, str]: + # 3.5.3 -> [3, 5, 3, ""] + # 4.0.0-preview2 -> [4, 0, 0, "preview2"] + arr = ver.split(".") + assert len(arr) == 3, arr + major, minor = int(arr[0]), int(arr[1]) + patch, _, preview = arr[2].partition("-") + return (major, minor, int(patch), preview) + + latest_version = max(versions, key=lambda ver: version_array(ver)) + LOGGER.info(f"Latest version: {latest_version}") + return latest_version + + +def download_spark( + spark_version: str, + hadoop_version: str, + scala_version: str, + spark_download_url: Path, +) -> str: + """ + Downloads and unpacks spark + The resulting spark directory name is returned + """ + LOGGER.info("Downloading and unpacking Spark") + spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" + if scala_version: + spark_dir_name += f"-scala{scala_version}" + LOGGER.info(f"Spark directory name: {spark_dir_name}") + spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" + + tmp_file = Path("/tmp/spark.tar.gz") + subprocess.check_call( + ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] + ) + subprocess.check_call( + [ + "tar", + "xzf", + tmp_file, + "-C", + "/usr/local", + "--owner", + "root", + "--group", + "root", + "--no-same-owner", + ] + ) + tmp_file.unlink() + return spark_dir_name + + +def configure_spark(spark_dir_name: str, spark_home: Path) -> None: + """ + Creates a ${SPARK_HOME} symlink to a versioned spark directory + Creates a 10spark-config.sh symlink to source PYTHONPATH automatically + """ + LOGGER.info("Configuring Spark") + subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) + + # Add a link in the before_notebook hook in order to source PYTHONPATH automatically + CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" + subprocess.check_call( + ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("--spark-version", required=True) + arg_parser.add_argument("--hadoop-version", required=True) + 
arg_parser.add_argument("--scala-version", required=True) + arg_parser.add_argument("--spark-download-url", type=Path, required=True) + args = arg_parser.parse_args() + + args.spark_version = args.spark_version or get_latest_spark_version() + + spark_dir_name = download_spark( + spark_version=args.spark_version, + hadoop_version=args.hadoop_version, + scala_version=args.scala_version, + spark_download_url=args.spark_download_url, + ) + configure_spark( + spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) + ) From 29e8daa3b53380474f7204831adfaa4522be71fe Mon Sep 17 00:00:00 2001 From: Bryan Paget <8212170+bryanpaget@users.noreply.github.com> Date: Wed, 18 Dec 2024 16:38:11 +0000 Subject: [PATCH 10/14] feat: implement spark install similar to upstream --- docker-bits/6_rstudio.Dockerfile | 8 ++++---- output/jupyterlab-cpu/Dockerfile | 8 ++++---- output/jupyterlab-pytorch/Dockerfile | 8 ++++---- output/jupyterlab-tensorflow/Dockerfile | 8 ++++---- output/rstudio/Dockerfile | 8 ++++---- output/sas/Dockerfile | 8 ++++---- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index cbb8035e7..ffcbed573 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -39,10 +39,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -57,6 +53,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + COPY setup_spark.py /opt/setup-scripts/ # Setup Spark diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 8cee6d0fc..f12c257fc 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -270,10 +270,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -288,6 +284,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + COPY setup_spark.py /opt/setup-scripts/ # Setup Spark diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 2182ce106..dc53a6e06 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -292,10 +292,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -310,6 +306,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" 
+ COPY setup_spark.py /opt/setup-scripts/ # Setup Spark diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 233e2e2ae..c32dd0fe8 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -399,10 +399,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -417,6 +413,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + COPY setup_spark.py /opt/setup-scripts/ # Setup Spark diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 80c95a5e7..e97988422 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -270,10 +270,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -288,6 +284,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + COPY setup_spark.py /opt/setup-scripts/ # Setup Spark diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index eb90aabf2..e7be7eb32 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -430,10 +430,6 @@ RUN python3 -m pip install \ # ENV SPARK_VERSION=3.5 -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" @@ -448,6 +444,10 @@ ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" +# RSpark config +ENV R_LIBS_USER="${SPARK_HOME}/R/lib" +RUN fix-permissions "${R_LIBS_USER}" + COPY setup_spark.py /opt/setup-scripts/ # Setup Spark From 16783b906e0909b1c262d700696d2db9b4b3fa3e Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Wed, 18 Dec 2024 12:49:33 -0500 Subject: [PATCH 11/14] fix: remove spark download url --- docker-bits/6_rstudio.Dockerfile | 6 +----- output/jupyterlab-cpu/Dockerfile | 6 +----- output/jupyterlab-pytorch/Dockerfile | 6 +----- output/jupyterlab-tensorflow/Dockerfile | 6 +----- output/rstudio/Dockerfile | 6 +----- output/sas/Dockerfile | 6 +----- 6 files changed, 6 insertions(+), 30 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index ffcbed573..c059d3401 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -37,17 +37,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark without Scala will be installed ARG scala_version -# URL to 
use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index f12c257fc..53c0ba519 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -268,17 +268,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark without Scala will be installed ARG scala_version -# URL to use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index dc53a6e06..3751a804f 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -290,17 +290,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark without Scala will be installed ARG scala_version -# URL to use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index c32dd0fe8..66a4f6916 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -397,17 +397,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark without Scala will be installed ARG scala_version -# URL to use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index e97988422..f28f4ea3c 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -268,17 +268,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark 
without Scala will be installed ARG scala_version -# URL to use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index e7be7eb32..7d69ae56e 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -428,17 +428,13 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ENV SPARK_VERSION=3.5 +# ARG SPARK_VERSION=3.5 # If spark_version is not set, latest Spark will be installed ARG spark_version ARG hadoop_version="3" # If scala_version is not set, Spark without Scala will be installed ARG scala_version -# URL to use for Spark downloads -# You need to use https://archive.apache.org/dist/spark/ website if you want to download old Spark versions -# But it seems to be slower, that's why we use the recommended site for download -ARG spark_download_url="https://dlcdn.apache.org/spark/" ENV SPARK_HOME=/usr/local/spark ENV PATH="${PATH}:${SPARK_HOME}/bin" From 2e4e2fe0ccf17bae70b356e933ae21ae77b81be0 Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Wed, 18 Dec 2024 12:58:10 -0500 Subject: [PATCH 12/14] fix: specify spark 3.4.1 because that url works --- docker-bits/6_rstudio.Dockerfile | 33 ++++------------------- output/jupyterlab-cpu/Dockerfile | 35 +++++-------------------- output/jupyterlab-pytorch/Dockerfile | 35 +++++-------------------- output/jupyterlab-tensorflow/Dockerfile | 35 +++++-------------------- output/rstudio/Dockerfile | 35 +++++-------------------- output/sas/Dockerfile | 35 +++++-------------------- 6 files changed, 35 insertions(+), 173 deletions(-) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index c059d3401..0a9972bc8 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -37,35 +37,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 +ARG SPARK_VERSION="3.4.1" -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version - -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git 
a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 53c0ba519..fdfc8eff4 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -268,35 +268,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 - -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version - -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +ARG SPARK_VERSION="3.4.1" + +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index 3751a804f..ca5c05a00 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -290,35 +290,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 - -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version - -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +ARG SPARK_VERSION="3.4.1" + +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 66a4f6916..7073928db 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -397,35 +397,12 
@@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 - -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version - -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +ARG SPARK_VERSION="3.4.1" + +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index f28f4ea3c..f670d902f 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -268,35 +268,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 - -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version - -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +ARG SPARK_VERSION="3.4.1" + +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 7d69ae56e..e0fffc22a 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -428,35 +428,12 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -# ARG SPARK_VERSION=3.5 - -# If spark_version is not set, latest Spark will be installed -ARG spark_version -ARG hadoop_version="3" -# If scala_version is not set, Spark without Scala will be installed -ARG scala_version 
- -ENV SPARK_HOME=/usr/local/spark -ENV PATH="${PATH}:${SPARK_HOME}/bin" -ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" - -# RSpark config -ENV R_LIBS_USER="${SPARK_HOME}/R/lib" -RUN fix-permissions "${R_LIBS_USER}" - -COPY setup_spark.py /opt/setup-scripts/ - -# Setup Spark -RUN /opt/setup-scripts/setup_spark.py \ - --spark-version="${spark_version}" \ - --hadoop-version="${hadoop_version}" \ - --scala-version="${scala_version}" \ - --spark-download-url="${spark_download_url}" - -# # Install sparklyr -# RUN apt-get update && \ -# apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ -# Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" +ARG SPARK_VERSION="3.4.1" + +# Install sparklyr +RUN apt-get update && \ + apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ + Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" From e0b56212afc91d49b613f361aa6b58bfe6c86a3d Mon Sep 17 00:00:00 2001 From: Bryan Paget Date: Wed, 18 Dec 2024 13:56:12 -0500 Subject: [PATCH 13/14] feat: add SPARK_HOME env var --- docker-bits/6_rstudio.Dockerfile | 10 +- .../setup_spark.py | 131 ------------------ output/jupyterlab-cpu/Dockerfile | 10 +- output/jupyterlab-cpu/setup_spark.py | 131 ------------------ output/jupyterlab-pytorch/Dockerfile | 10 +- output/jupyterlab-pytorch/setup_spark.py | 131 ------------------ output/jupyterlab-tensorflow/Dockerfile | 10 +- output/jupyterlab-tensorflow/setup_spark.py | 131 ------------------ output/remote-desktop/setup_spark.py | 131 ------------------ output/rstudio/Dockerfile | 10 +- output/rstudio/setup_spark.py | 131 ------------------ output/sas/Dockerfile | 10 +- output/sas/setup_spark.py | 131 ------------------ resources/common/setup_spark.py | 131 ------------------ 14 files changed, 12 insertions(+), 1096 deletions(-) delete mode 100644 output/docker-stacks-datascience-notebook/setup_spark.py delete mode 100644 output/jupyterlab-cpu/setup_spark.py delete mode 100644 output/jupyterlab-pytorch/setup_spark.py delete mode 100644 output/jupyterlab-tensorflow/setup_spark.py delete mode 100644 output/remote-desktop/setup_spark.py delete mode 100644 output/rstudio/setup_spark.py delete mode 100644 output/sas/setup_spark.py delete mode 100644 resources/common/setup_spark.py diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 0a9972bc8..62724d652 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -12,6 +12,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -20,7 +22,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -37,13 +38,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV 
DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/docker-stacks-datascience-notebook/setup_spark.py b/output/docker-stacks-datascience-notebook/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/docker-stacks-datascience-notebook/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. - -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - 
arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index fdfc8eff4..4f23054c2 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -243,6 +243,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -251,7 +253,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -268,13 +269,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-cpu/setup_spark.py b/output/jupyterlab-cpu/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/jupyterlab-cpu/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. 
- -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - 
configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index ca5c05a00..d631d8ff3 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -265,6 +265,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -273,7 +275,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -290,13 +291,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-pytorch/setup_spark.py b/output/jupyterlab-pytorch/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/jupyterlab-pytorch/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. - -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / 
f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 7073928db..0bae81c7c 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -372,6 +372,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -380,7 +382,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -397,13 +398,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/jupyterlab-tensorflow/setup_spark.py b/output/jupyterlab-tensorflow/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/jupyterlab-tensorflow/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. 
- -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - 
configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/remote-desktop/setup_spark.py b/output/remote-desktop/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/remote-desktop/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. - -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - 
arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index f670d902f..9795add3f 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -243,6 +243,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -251,7 +253,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -268,13 +269,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/rstudio/setup_spark.py b/output/rstudio/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/rstudio/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. 
- -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - 
configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index e0fffc22a..53c3134a7 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -403,6 +403,8 @@ RUN mkdir -p /etc/rstudio && \ ENV PATH=$PATH:/usr/lib/rstudio-server/bin +ENV SPARK_HOME="/opt/conda/lib/python3.11/site-packages/pyspark" + # Install some default R packages RUN mamba install --quiet --yes \ 'r-arrow' \ @@ -411,7 +413,6 @@ RUN mamba install --quiet --yes \ 'r-e1071' \ 'r-hdf5r' \ 'r-markdown' \ - 'r-sparklyr' \ 'r-odbc' \ 'r-renv' \ 'r-rodbc' \ @@ -428,13 +429,6 @@ RUN python3 -m pip install \ fix-permissions $CONDA_DIR && \ fix-permissions /home/$NB_USER -ARG SPARK_VERSION="3.4.1" - -# Install sparklyr -RUN apt-get update && \ - apt install -y --no-install-recommends libxml2-dev libcurl4-openssl-dev && \ - Rscript -e "library(sparklyr); spark_install(version = ${SPARK_VERSION})" - # If using the docker bit in other Dockerfiles, this must get written over in a later layer ENV DEFAULT_JUPYTER_URL="/rstudio" ENV GIT_EXAMPLE_NOTEBOOKS=https://gitlab.k8s.cloud.statcan.ca/business-transformation/aaw/aaw-contrib-r-notebooks.git diff --git a/output/sas/setup_spark.py b/output/sas/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/output/sas/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. - -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", 
"--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) diff --git a/resources/common/setup_spark.py b/resources/common/setup_spark.py deleted file mode 100644 index c5b76433f..000000000 --- a/resources/common/setup_spark.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Jupyter Development Team. -# Distributed under the terms of the Modified BSD License. 
- -# Requirements: -# - Run as the root user -# - Required env variable: SPARK_HOME - -import argparse -import logging -import os -import subprocess -from pathlib import Path - -import requests -from bs4 import BeautifulSoup - -LOGGER = logging.getLogger(__name__) - - -def get_all_refs(url: str) -> list[str]: - """ - Get all the references for a given webpage - """ - resp = requests.get(url) - soup = BeautifulSoup(resp.text, "html.parser") - return [a["href"] for a in soup.find_all("a", href=True)] - - -def get_latest_spark_version() -> str: - """ - Returns the last version of Spark using spark archive - """ - LOGGER.info("Downloading Spark versions information") - all_refs = get_all_refs("https://archive.apache.org/dist/spark/") - versions = [ - ref.removeprefix("spark-").removesuffix("/") - for ref in all_refs - if ref.startswith("spark-") and "incubating" not in ref - ] - - # Compare versions semantically - def version_array(ver: str) -> tuple[int, int, int, str]: - # 3.5.3 -> [3, 5, 3, ""] - # 4.0.0-preview2 -> [4, 0, 0, "preview2"] - arr = ver.split(".") - assert len(arr) == 3, arr - major, minor = int(arr[0]), int(arr[1]) - patch, _, preview = arr[2].partition("-") - return (major, minor, int(patch), preview) - - latest_version = max(versions, key=lambda ver: version_array(ver)) - LOGGER.info(f"Latest version: {latest_version}") - return latest_version - - -def download_spark( - spark_version: str, - hadoop_version: str, - scala_version: str, - spark_download_url: Path, -) -> str: - """ - Downloads and unpacks spark - The resulting spark directory name is returned - """ - LOGGER.info("Downloading and unpacking Spark") - spark_dir_name = f"spark-{spark_version}-bin-hadoop{hadoop_version}" - if scala_version: - spark_dir_name += f"-scala{scala_version}" - LOGGER.info(f"Spark directory name: {spark_dir_name}") - spark_url = spark_download_url / f"spark-{spark_version}" / f"{spark_dir_name}.tgz" - - tmp_file = Path("/tmp/spark.tar.gz") - subprocess.check_call( - ["curl", "--progress-bar", "--location", "--output", tmp_file, spark_url] - ) - subprocess.check_call( - [ - "tar", - "xzf", - tmp_file, - "-C", - "/usr/local", - "--owner", - "root", - "--group", - "root", - "--no-same-owner", - ] - ) - tmp_file.unlink() - return spark_dir_name - - -def configure_spark(spark_dir_name: str, spark_home: Path) -> None: - """ - Creates a ${SPARK_HOME} symlink to a versioned spark directory - Creates a 10spark-config.sh symlink to source PYTHONPATH automatically - """ - LOGGER.info("Configuring Spark") - subprocess.check_call(["ln", "-s", f"/usr/local/{spark_dir_name}", spark_home]) - - # Add a link in the before_notebook hook in order to source PYTHONPATH automatically - CONFIG_SCRIPT = "/usr/local/bin/before-notebook.d/10spark-config.sh" - subprocess.check_call( - ["ln", "-s", spark_home / "sbin/spark-config.sh", CONFIG_SCRIPT] - ) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - - arg_parser = argparse.ArgumentParser() - arg_parser.add_argument("--spark-version", required=True) - arg_parser.add_argument("--hadoop-version", required=True) - arg_parser.add_argument("--scala-version", required=True) - arg_parser.add_argument("--spark-download-url", type=Path, required=True) - args = arg_parser.parse_args() - - args.spark_version = args.spark_version or get_latest_spark_version() - - spark_dir_name = download_spark( - spark_version=args.spark_version, - hadoop_version=args.hadoop_version, - scala_version=args.scala_version, - spark_download_url=args.spark_download_url, - ) - 
configure_spark( - spark_dir_name=spark_dir_name, spark_home=Path(os.environ["SPARK_HOME"]) - ) From ac650fc49d2429548cc601178b0f9d6cb1a847f9 Mon Sep 17 00:00:00 2001 From: Bryan Paget <8212170+bryanpaget@users.noreply.github.com> Date: Fri, 20 Dec 2024 18:16:40 +0000 Subject: [PATCH 14/14] feat: add r-sparklyr I'm adding this package to the base because it seems to get uninstalled when I sleep and resume my notebook server. --- docker-bits/6_rstudio.Dockerfile | 1 + output/jupyterlab-cpu/Dockerfile | 1 + output/jupyterlab-pytorch/Dockerfile | 1 + output/jupyterlab-tensorflow/Dockerfile | 1 + output/rstudio/Dockerfile | 1 + output/sas/Dockerfile | 1 + 6 files changed, 6 insertions(+) diff --git a/docker-bits/6_rstudio.Dockerfile b/docker-bits/6_rstudio.Dockerfile index 62724d652..7fe5ba430 100644 --- a/docker-bits/6_rstudio.Dockerfile +++ b/docker-bits/6_rstudio.Dockerfile @@ -26,6 +26,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \ diff --git a/output/jupyterlab-cpu/Dockerfile b/output/jupyterlab-cpu/Dockerfile index 4f23054c2..11bfefdb1 100644 --- a/output/jupyterlab-cpu/Dockerfile +++ b/output/jupyterlab-cpu/Dockerfile @@ -257,6 +257,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \ diff --git a/output/jupyterlab-pytorch/Dockerfile b/output/jupyterlab-pytorch/Dockerfile index d631d8ff3..e63edc78c 100644 --- a/output/jupyterlab-pytorch/Dockerfile +++ b/output/jupyterlab-pytorch/Dockerfile @@ -279,6 +279,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \ diff --git a/output/jupyterlab-tensorflow/Dockerfile b/output/jupyterlab-tensorflow/Dockerfile index 0bae81c7c..1be4277fd 100644 --- a/output/jupyterlab-tensorflow/Dockerfile +++ b/output/jupyterlab-tensorflow/Dockerfile @@ -386,6 +386,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \ diff --git a/output/rstudio/Dockerfile b/output/rstudio/Dockerfile index 9795add3f..0166d038b 100644 --- a/output/rstudio/Dockerfile +++ b/output/rstudio/Dockerfile @@ -257,6 +257,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \ diff --git a/output/sas/Dockerfile b/output/sas/Dockerfile index 53c3134a7..45fa2aed7 100644 --- a/output/sas/Dockerfile +++ b/output/sas/Dockerfile @@ -417,6 +417,7 @@ RUN mamba install --quiet --yes \ 'r-renv' \ 'r-rodbc' \ 'r-sf' \ + 'r-sparklyr' \ 'r-tidyverse' \ && \ clean-layer.sh && \
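
Net effect of the series: Spark now comes from the pip-installed pyspark package, SPARK_HOME points at its site-packages directory, the old setup_spark.py download path is gone, and sparklyr is baked into the images via conda-forge's r-sparklyr, so no spark_install() download should be needed at start-up. A minimal R sketch of how a user might confirm that wiring inside one of these images — the SPARK_HOME path is the one set in the Dockerfiles above and assumes a Python 3.11 conda environment, so adjust it if the interpreter version differs:

    # Point sparklyr at the Spark distribution bundled with the pip-installed pyspark
    library(sparklyr)
    Sys.setenv(SPARK_HOME = "/opt/conda/lib/python3.11/site-packages/pyspark")

    # Local-mode connection; spark_connect() reads SPARK_HOME by default,
    # so no separate spark_install() download is required here.
    sc <- spark_connect(master = "local")

    # Round-trip a small data frame to verify the session actually works
    cars_tbl <- copy_to(sc, mtcars, "mtcars_spark", overwrite = TRUE)
    print(head(cars_tbl))

    spark_disconnect(sc)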