From 9ec4a3907252f5bf3beb69bcbeaa048522e66c92 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Mon, 4 Apr 2022 00:31:34 +0200 Subject: [PATCH] ci: de-duplicate deps and get ci images to build again --- .github/workflows/build-publish-docs.yaml | 56 +----- .github/workflows/test.yaml | 101 ++++++++--- continuous_integration/docker/README.md | 137 +++++++++++++++ continuous_integration/docker/base/Dockerfile | 102 ++++++++--- .../base/files/etc/sudoers.d/preserve_path | 24 +++ .../docker/hadoop/Dockerfile | 166 +++++++++++++----- .../docker/hadoop/_install.sh | 31 ++-- .../docker/hadoop/_script.sh | 7 +- .../conf.kerberos/container-executor.cfg | 14 +- .../etc/hadoop/conf.kerberos/core-site.xml | 2 +- .../etc/hadoop/conf.kerberos/hdfs-site.xml | 6 +- .../etc/hadoop/conf.kerberos/mapred-site.xml | 2 +- .../etc/hadoop/conf.kerberos/yarn-site.xml | 19 +- .../etc/hadoop/conf.simple/core-site.xml | 2 +- .../docker/hadoop/files/etc/krb5.conf | 29 +-- .../docker/hadoop/files/etc/supervisord.conf | 14 +- .../etc/supervisord.d/hdfs-datanode.conf | 16 +- .../etc/supervisord.d/hdfs-namenode.conf | 16 +- .../files/etc/supervisord.d/kerberos.conf | 27 +-- .../etc/supervisord.d/yarn-nodemanager.conf | 16 +- .../supervisord.d/yarn-resourcemanager.conf | 16 +- .../docker/hadoop/files/root/setup-hadoop.sh | 33 ---- .../docker/hadoop/files/root/setup-kerb.sh | 29 --- .../files/{root => scripts}/init-hdfs.sh | 3 +- .../hadoop/files/scripts/setup-hadoop.sh | 75 ++++++++ .../docker/hadoop/files/scripts/setup-kerb.sh | 88 ++++++++++ .../files/var/kerberos/krb5kdc/kdc.conf | 14 +- continuous_integration/docker/hadoop/start.sh | 2 +- continuous_integration/docker/pbs/Dockerfile | 52 ++++-- continuous_integration/docker/pbs/_install.sh | 28 +-- continuous_integration/docker/pbs/_script.sh | 5 +- .../docker/pbs/files/etc/sudoers.d/dask | 2 +- .../pbs/files/{root => scripts}/start.sh | 0 continuous_integration/docker/pbs/start.sh | 2 +- .../docker/slurm/Dockerfile | 117 +++++++----- 
.../docker/slurm/_install.sh | 28 +-- .../docker/slurm/_script.sh | 5 +- .../docker/slurm/files/etc/slurm/slurm.conf | 5 +- .../docker/slurm/files/etc/sudoers.d/dask | 2 +- .../slurm/files/etc/supervisord.d/slurm.conf | 1 + .../files/{root => scripts}/init-mysql.sh | 7 +- continuous_integration/docker/slurm/start.sh | 2 +- continuous_integration/install.sh | 34 ---- continuous_integration/kubernetes/install.sh | 24 --- continuous_integration/kubernetes/script.sh | 2 - dask-gateway-server/setup.py | 15 +- dask-gateway/setup.py | 16 +- docs/requirements.txt | 20 +++ docs/source/develop.rst | 2 +- tests/requirements.txt | 65 +++++++ tests/test_pbs_backend.py | 4 +- tests/test_slurm_backend.py | 4 +- tests/test_yarn_backend.py | 4 +- 53 files changed, 980 insertions(+), 513 deletions(-) create mode 100644 continuous_integration/docker/README.md create mode 100644 continuous_integration/docker/base/files/etc/sudoers.d/preserve_path delete mode 100755 continuous_integration/docker/hadoop/files/root/setup-hadoop.sh delete mode 100755 continuous_integration/docker/hadoop/files/root/setup-kerb.sh rename continuous_integration/docker/hadoop/files/{root => scripts}/init-hdfs.sh (98%) create mode 100755 continuous_integration/docker/hadoop/files/scripts/setup-hadoop.sh create mode 100755 continuous_integration/docker/hadoop/files/scripts/setup-kerb.sh rename continuous_integration/docker/pbs/files/{root => scripts}/start.sh (100%) rename continuous_integration/docker/slurm/files/{root => scripts}/init-mysql.sh (94%) delete mode 100755 continuous_integration/install.sh delete mode 100755 continuous_integration/kubernetes/install.sh delete mode 100755 continuous_integration/kubernetes/script.sh create mode 100644 docs/requirements.txt create mode 100644 tests/requirements.txt diff --git a/.github/workflows/build-publish-docs.yaml b/.github/workflows/build-publish-docs.yaml index 3ae786d4..d92cac0e 100644 --- a/.github/workflows/build-publish-docs.yaml +++ 
b/.github/workflows/build-publish-docs.yaml @@ -9,14 +9,12 @@ on: - "docs/**" - "dask-gateway/**" - "dask-gateway-server/**" - - "continuous_integration/install.sh" - ".github/workflows/build-publish-docs.yaml" push: paths: - "docs/**" - "dask-gateway/**" - "dask-gateway-server/**" - - "continuous_integration/install.sh" - ".github/workflows/build-publish-docs.yaml" branches-ignore: - "dependabot/**" @@ -39,44 +37,18 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 with: python-version: "3.10" - # FIXME: go is used to compile dask-gateway-server/dask-gateway-proxy, but - # that isn't relevant for us when building documentation, but, we - # end up doing it anyhow currently because we re-use a course - # installation script. - # - # If this is fixed, also update the job below. - # - - uses: actions/setup-go@v3 - with: - go-version: "1.18" - - # FIXME: node is used to install configurable-http-proxy, used by - # JupyterHub when started for us to run tests. As we don't need - # this to setup docs, we can get rid of this if we update our - # installation of dependencies to be a bit more scoped. - # - # If this is fixed, also update the job below. 
- # - - uses: actions/setup-node@v3 - with: - node-version: "16" - - - name: Install docs requirements + - name: Install Python docs requirements run: | - pushd dask-gateway-server/dask-gateway-proxy - go get github.com/stretchr/testify/assert - popd - continuous_integration/install.sh - pip install sphinx dask-sphinx-theme sphinx-autobuild autodoc-traits kubernetes_asyncio skein sqlalchemy + cd docs + pip install -r requirements.txt - name: Build docs (make html) run: | - pushd docs + cd docs make html SPHINXOPTS='--color -W --keep-going' - name: Push built docs to gh-pages branch @@ -92,28 +64,16 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 with: python-version: "3.10" - - uses: actions/setup-go@v3 - with: - go-version: "1.18" - - - uses: actions/setup-node@v3 - with: - node-version: "16" - - - name: Install docs requirements + - name: Install Python docs requirements run: | - pushd dask-gateway-server/dask-gateway-proxy - go get github.com/stretchr/testify/assert - popd - continuous_integration/install.sh - pip install sphinx dask-sphinx-theme sphinx-autobuild autodoc-traits kubernetes_asyncio skein sqlalchemy + cd docs + pip install -r requirements.txt - name: Linkcheck docs (make linkcheck) run: | - pushd docs + cd docs make linkcheck SPHINXOPTS='--color -W --keep-going' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 8b38bf68..bf620bb4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -67,20 +67,26 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 with: python-version: "${{ matrix.python-version }}" - - uses: actions/setup-go@v3 with: go-version: "${{ matrix.go-version }}" - - uses: actions/setup-node@v3 + # jupyterhub will when being mock started as part of running tests depend + # on the Node npm package configurable-http-proxy. 
+ # + - name: Install jupyterhub with system dependencies + run: | + npm install -g configurable-http-proxy + pip install jupyterhub + - name: Install Python test requirements run: | - continuous_integration/install.sh + cd tests + pip install -r requirements.txt - name: List Python packages run: | @@ -88,7 +94,7 @@ jobs: - name: Run Python tests run: | - py.test tests/ -k 'not kubernetes' -v + pytest -v tests/ -k 'not kubernetes' - name: Install Go test requirements run: | @@ -100,17 +106,6 @@ jobs: cd dask-gateway-server/dask-gateway-proxy go test - hadoop-tests: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Hadoop Install/Start - if: ${{ env.commit_msg != 'skip-tests' }} - run: | - ./continuous_integration/docker/hadoop/start.sh - ./continuous_integration/docker/hadoop/install.sh - ./continuous_integration/docker/hadoop/script.sh - kubernetes-tests: runs-on: ubuntu-latest strategy: @@ -150,11 +145,21 @@ jobs: - name: Helm Install run: | ./continuous_integration/kubernetes/helm-install.sh - ./continuous_integration/kubernetes/install.sh + + - name: Install Python test requirements + run: | + cd tests + pip install -r requirements.txt + + - name: List Python packages + run: | + pip list - name: Kubernetes Tests run: | - ./continuous_integration/kubernetes/script.sh + TEST_DASK_GATEWAY_KUBE=true \ + TEST_DASK_GATEWAY_KUBE_ADDRESS=http://localhost:30200/services/dask-gateway/ \ + pytest -v tests/kubernetes/ # ref: https://github.com/jupyterhub/action-k8s-namespace-report - name: Kubernetes namespace report @@ -166,22 +171,78 @@ jobs: deploy/controller-test-dask-gateway deploy/traefik-test-dask-gateway + # The tests run in this job rely on by setting up a development environment in + # a pre-built container. 
+ # + # - start.sh - starts the container + # - install.sh - setups for testing in the container + # - script.sh - runs tests in the container + # + hadoop-tests: + runs-on: ubuntu-latest + permissions: + contents: read + packages: read + + steps: + - uses: actions/checkout@v3 + + - name: Login to ghcr.io read access to CI image + run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin + + - name: Hadoop tests (Yarn and Kerberos involved) + if: ${{ env.commit_msg != 'skip-tests' }} + run: | + ./continuous_integration/docker/hadoop/start.sh + ./continuous_integration/docker/hadoop/install.sh + ./continuous_integration/docker/hadoop/script.sh + + # The tests run in this job rely on by setting up a development environment in + # a pre-built container. + # + # - start.sh - starts the container + # - install.sh - setups for testing in the container + # - script.sh - runs tests in the container + # pbs-tests: runs-on: ubuntu-latest + permissions: + contents: read + packages: read + steps: - uses: actions/checkout@v3 - - name: PBS Tests + + - name: Login to ghcr.io read access to CI image + run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin + + - name: PBS tests (A jobqueue backend) if: ${{ env.commit_msg != 'skip-tests' }} run: | ./continuous_integration/docker/pbs/start.sh ./continuous_integration/docker/pbs/install.sh ./continuous_integration/docker/pbs/script.sh + # The tests run in this job rely on by setting up a development environment in + # a pre-built container. 
+ # + # - start.sh - starts the container + # - install.sh - setups for testing in the container + # - script.sh - runs tests in the container + # slurm-tests: runs-on: ubuntu-latest + permissions: + contents: read + packages: read + steps: - uses: actions/checkout@v3 - - name: Slurm Tests + + - name: Login to ghcr.io read access to CI image + run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin + + - name: Slurm tests (A jobqueue backend) if: ${{ env.commit_msg != 'skip-tests' }} run: | ./continuous_integration/docker/slurm/start.sh diff --git a/continuous_integration/docker/README.md b/continuous_integration/docker/README.md new file mode 100644 index 00000000..d8d74280 --- /dev/null +++ b/continuous_integration/docker/README.md @@ -0,0 +1,137 @@ +# About these Dockerfiles + +As dask-gateway can be used to start different kinds of dask clusters, we need +to be able to test against those dask cluster backends. To do that we maintain +docker images setup to run the various dask cluster backends so we can test +against them. + +The images doesn't install `dask-gateway-server` within them as then we would +need to rebuild the images all the time with the specific version of +`dask-gateway-server` we want to test. Instead, the idea is to mount the local +code to a container and install dependencies before that before running the +tests. For example the `start.sh` script starts a container, and +`install.sh`/`script.sh` are wrappers to run `_install.sh`/`_script.py` scripts +in the started container. + +## Manual build and update of images + +For now these images are built and updated manually. Below are instructions for +a maintainer of the dask/dask-gateway repo on how to do it. + +1. Create a personal access token (PAT) for your account with `write:packages` + permissions at https://github.com/settings/tokens/new. + +1. 
Login to the ghcr.io container registry with the PAT: + + ```shell + docker login ghcr.io -u your-username + ``` + +1. Build the images: + + ```shell + docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-base ./base + docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-hadoop ./hadoop + docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-pbs ./pbs + docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-slurm ./slurm + ``` + +1. Verify that images seem to work + + ```shell + # hadoop: verify that the supervisord programs starts successfully + docker run --hostname=master.example.com --rm ghcr.io/dask/dask-gateway-ci-hadoop + + # pbs: verify that logs doesn't include errors + docker run --hostname=pbs --rm ghcr.io/dask/dask-gateway-ci-pbs + + # slurm: verify that the supervisord programs starts successfully + docker run --hostname=slurm --rm ghcr.io/dask/dask-gateway-ci-slurm + ``` + +1. Push the images: + + ```shell + docker push ghcr.io/dask/dask-gateway-ci-base + docker push ghcr.io/dask/dask-gateway-ci-hadoop + docker push ghcr.io/dask/dask-gateway-ci-pbs + docker push ghcr.io/dask/dask-gateway-ci-slurm + ``` + +## Debugging + +### General advice + +1. If you get a `docker build` error, you can do `docker run -it --rm ` to + a saved layer before the erroring step and then manually do the next `RUN` + step or inspect the file system of its current state. Note that intermediary + layers are not saved if you have set `export DOCKER_BUILDKIT=1`, so this + trick can only be used without buildkit. +1. A Dockerfile's `COPY` command can update permissions of folders if you let it + copy nested folders. For example, `COPY ./files /` would update the + permissions of `/etc` based on the permissions set on the folder and files in + this git repo locally. +1. File permissions you have set in this git repo locally won't be version + controlled, besides the execute bit. Due to that, you must avoid relying on + local file permissions when building images. 
+ +### The hadoop image + +Setting up the YARN backend, part of Hadoop, was very tricky. Here are some +commands of relevance to debug the container. + +```shell +# Build the base image +docker build --tag ghcr.io/dask/dask-gateway-ci-base ./base + +# Build the hadoop image +docker build --tag ghcr.io/dask/dask-gateway-ci-hadoop ./hadoop + +# Start a container and watch logs from supervisord that starts the various +# programs we need to configure and run successfully. +docker run --hostname master.example.com --rm ghcr.io/dask/dask-gateway-ci-hadoop + +# Start a container and inspect the container from a shell if something doesn't +# start correctly. +docker stop hadoop --timeout=0 +docker run --name hadoop --hostname master.example.com --detach --rm ghcr.io/dask/dask-gateway-ci-hadoop +docker exec -it hadoop bash + +# Useful commands to run INSIDE the built and started container +supervisorctl status +cat /var/log/supervisor/hdfs-namenode.log +cat /var/log/supervisor/hdfs-datanode.log +cat /var/log/supervisor/yarn-nodemanager.log +cat /var/log/supervisor/yarn-resourcemanager.log +cat /var/log/supervisor/krb5kdc.log +cat /var/log/supervisor/kadmind.log +``` + +### The slurm image + +If you upgrade `slurm` to a new version, you may very well run into breaking +changes in your `slurm.conf`. + +```shell +# Build the base image +docker build --tag ghcr.io/dask/dask-gateway-ci-base ./base + +# Build the slurm image +docker build --tag ghcr.io/dask/dask-gateway-ci-slurm ./slurm + +# Start a container and watch logs from supervisord that starts the various +# programs we need to configure and run successfully. +docker run --hostname slurm --rm ghcr.io/dask/dask-gateway-ci-slurm + +# Start a container and inspect the container from a shell if something doesn't +# start correctly. 
+docker stop slurm --timeout=0 +docker run --name slurm --hostname slurm --detach --rm ghcr.io/dask/dask-gateway-ci-slurm +docker exec -it slurm bash + +# Useful commands to run INSIDE the built and started container +supervisorctl status +cat /var/log/supervisord.log +cat /var/log/supervisor/slurmdbd.log +cat /var/log/supervisor/slurmctld.log +``` diff --git a/continuous_integration/docker/base/Dockerfile b/continuous_integration/docker/base/Dockerfile index 9f99be2c..d7e06060 100644 --- a/continuous_integration/docker/base/Dockerfile +++ b/continuous_integration/docker/base/Dockerfile @@ -1,33 +1,79 @@ -FROM centos:7.6.1810 +# See continuous_integration/docker/README.md for details about this and other +# Dockerfiles under the continuous_integration/docker folder on their purpose +# and how to work with them. +# +# centos:8 reached end-of-life 31 Dec 2021 +# centos:7 reach end-of-life 30 Jun 2024 +# +FROM centos:7 -# Install miniconda and tini +ARG python_version="3.10" +ARG go_version="1.18" + +LABEL org.opencontainers.image.source="https://github.com/dask-gateway" + +# Install common yum packages +RUN yum install -y \ + sudo \ + # sudo is used to run commands as various other users + git \ + # git is a requirement for golang to fetch dependencies during + # compilation of the golang code we have in + # dask-gateway-server/dask-gateway-proxy. + && yum clean all \ + && rm -rf /var/cache/yum + +# Install python and the following utilities: +# +# - tini: can wrap an container entrypoint to avoid misc issues, see +# https://github.com/krallin/tini#readme +# - psutil: provides misc tools of relevance for debugging, see +# https://psutil.readthedocs.io/en/latest/#about +# +# NOTE: micromamba is a slimmed mamba/conda executable functioning without a +# pre-installed Python environment we use to install a Python version of +# choice to not first need to install a full Python environment to then +# install another Python environment. 
+# +# See https://github.com/mamba-org/mamba#micromamba. +# RUN yum install -y bzip2 \ - && curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \ - && /bin/bash /tmp/miniconda.sh -b -p /opt/miniconda \ - && rm /tmp/miniconda.sh \ - && /opt/miniconda/bin/conda update conda -y \ - && /opt/miniconda/bin/conda config --set always_yes yes --set changeps1 no \ - && /opt/miniconda/bin/conda install tini \ - && /opt/miniconda/bin/conda clean -af \ - && find /opt/miniconda/ -type f -name '*.a' -delete \ - && find /opt/miniconda/ -type f -name '*.pyc' -delete \ - && yum remove -y bzip2 \ - && yum clean all \ - && rm -rf /var/cache/yum + \ + && curl -sL https://micromamba.snakepit.net/api/micromamba/linux-64/latest \ + | tar --extract --verbose --bzip2 bin/micromamba --strip-components=1 \ + && ./micromamba install \ + --channel conda-forge \ + --root-prefix="/opt/python" \ + --prefix="/opt/python" \ + python=${python_version} \ + mamba \ + psutil \ + tini \ + && rm ./micromamba \ + && /opt/python/bin/mamba clean -af \ + && find /opt/python/ -type f -name '*.a' -delete \ + && find /opt/python/ -type f -name '*.pyc' -delete \ + \ + && yum remove -y bzip2 \ + && yum clean all \ + && rm -rf /var/cache/yum # Install go -RUN curl https://dl.google.com/go/go1.12.2.linux-amd64.tar.gz -o /tmp/go.tar.gz \ - && tar -xzf /tmp/go.tar.gz -C /opt/ \ - && rm /tmp/go.tar.gz +RUN curl -sL https://dl.google.com/go/go${go_version}.linux-amd64.tar.gz \ + | tar --extract --verbose --gzip --directory=/opt/ + +# Put Python and Go environments on PATH +# +# NOTE: This PATH environment will be preserved if sudo is used to switch to +# other users thanks to changes to /etc/sudoers.d/preserve_path. 
+# +ENV PATH=/opt/python/bin:/opt/go/bin:$PATH +COPY ./files/etc /etc/ -# Make a few user accounts, and configure their bashrc files -RUN useradd -m dask \ - && useradd -m alice \ - && useradd -m bob \ - && groupadd dask_users \ - && usermod -a -G dask_users alice \ - && usermod -a -G dask_users bob \ - && echo 'export PATH="/opt/go/bin:/opt/miniconda/bin:$PATH"' >> ~/.bashrc \ - && echo 'export PATH="/opt/miniconda/bin:$PATH"' >> /home/dask/.bashrc \ - && echo 'export PATH="/opt/miniconda/bin:$PATH"' >> /home/alice/.bashrc \ - && echo 'export PATH="/opt/miniconda/bin:$PATH"' >> /home/bob/.bashrc +# Make a few user accounts and a user group for later use +RUN useradd --create-home dask \ + && useradd --create-home alice \ + && useradd --create-home bob \ + && groupadd dask_users \ + && usermod --append --groups dask_users alice \ + && usermod --append --groups dask_users bob diff --git a/continuous_integration/docker/base/files/etc/sudoers.d/preserve_path b/continuous_integration/docker/base/files/etc/sudoers.d/preserve_path new file mode 100644 index 00000000..4c06f097 --- /dev/null +++ b/continuous_integration/docker/base/files/etc/sudoers.d/preserve_path @@ -0,0 +1,24 @@ +# This config ensures that the PATH environment variable this only-for-testing +# container is started with is preserved when changing to other users with sudo. +# +# NOTES: +# +# - `sudo` is used to execute commands as other users. What then happens to the +# environment will be determined by configuration in /etc/sudoers and +# /etc/sudoers.d/* as well as flags we pass to the sudo command. The behavior +# can be inspected with `sudo -V` run as root. +# +# ref: `man sudo` https://linux.die.net/man/8/sudo +# ref: `man sudoers` https://www.sudo.ws/man/1.8.15/sudoers.man.html +# +# - We disable the `secure_path` which is set by default in /etc/sudoers as it +# would override the PATH variable. 
+Defaults !secure_path +# +# - We can use the `-E` or `--preserve-env` flag to pass through most +# environment variables, but understand that exceptions are caused by the +# sudoers configuration: `env_delete`, `env_check`, and `secure_path`. +# +# - We reduce the `env_delete` list of default variables to be deleted. It has +# higher priority than the `--preserve-env` flag and `env_keep` configuration. +Defaults env_delete -= "PATH" diff --git a/continuous_integration/docker/hadoop/Dockerfile b/continuous_integration/docker/hadoop/Dockerfile index 91ff74b5..f7f88663 100644 --- a/continuous_integration/docker/hadoop/Dockerfile +++ b/continuous_integration/docker/hadoop/Dockerfile @@ -1,56 +1,128 @@ -FROM daskgateway/testing-base:latest -MAINTAINER jcrist - -# Install common utilities -RUN yum install -y epel-release \ - && yum install -y \ - sudo \ - bzip2 \ - java-1.8.0-openjdk \ - supervisor \ - && yum clean all \ - && rm -rf /var/cache/yum +# See continuous_integration/docker/README.md for details about this and other +# Dockerfiles under the continuous_integration/docker folder on their purpose +# and how to work with them. +# +FROM ghcr.io/dask/dask-gateway-ci-base:latest + +# Notify dask-gateway tests that Yarn (part of Hadoop) is available +ENV TEST_DASK_GATEWAY_YARN true + + + +# Install hadoop +# +# 1. Create hadoop users and groups. +# +RUN groupadd --system hadoop \ + && useradd yarn --system --gid hadoop \ + && useradd hdfs --system --gid hadoop \ + && useradd mapred --system --gid hadoop +# +# 2. Install hadoop v3 dependencies +# +# - Java 8+ +# - OpenSSL 1.1 (openssl 1.0 comes with centos:7, found via epel-release repo) +# +RUN yum install -y \ + epel-release \ + && yum install -y \ + java-11-openjdk \ + openssl11-libs \ + && yum clean all \ + && rm -rf /var/cache/yum +ENV JAVA_HOME /usr/lib/jvm/jre-openjdk +# +# 3. 
Download and unpack hadoop +# +# hadoop versions: https://dlcdn.apache.org/hadoop/common/ +# hadoop changelog: https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/release/ +# +# We set the owner user:group to root:hadoop and declare the setuid and +# setgid bits for this directory so that folders created in it become owned +# by root:hadoop as well. +# +RUN INSTALL_HADOOP_VERSION=3.3.2 \ + && curl -sL /tmp/hadoop.tar.gz https://dlcdn.apache.org/hadoop/common/stable/hadoop-${INSTALL_HADOOP_VERSION}.tar.gz \ + | tar -xvz --directory /opt \ + && mv /opt/hadoop-* /opt/hadoop \ + && chown -R root:hadoop /opt/hadoop \ + && chmod ug+s /opt/hadoop +# +# 4. Copy our hadoop configurations +# +# The permissions are important. +# +COPY --chown=root:hadoop ./files/etc/hadoop /etc/hadoop/ +# +# 5. Update PATH environment variable +# +# Note that this PATH environment will be preserved when sudo is used to +# switch to other users thanks to changes to /etc/sudoers.d/preserve_path, +# which is configured in the base Dockerfile. +# +ENV PATH=/opt/hadoop/sbin:/opt/hadoop/bin:$PATH +# +# 6. Copy our setup script and run it +# +COPY --chown=root:hadoop ./files/scripts/setup-hadoop.sh /scripts/ +COPY --chown=root:hadoop ./files/scripts/init-hdfs.sh /scripts/ +RUN /scripts/setup-hadoop.sh + + # Install kerberos +# +# 1. Install yum packages +# RUN yum install -y \ krb5-libs \ krb5-server \ krb5-workstation \ - && yum clean all \ - && rm -rf /var/cache/yum + && yum clean all \ + && rm -rf /var/cache/yum +# +# 2. Copy our kerberos configuration +# +COPY ./files/etc/krb5.conf /etc/ +COPY ./files/var/kerberos/krb5kdc /var/kerberos/krb5kdc/ +# +# 3. 
Copy our setup script and run it +# +COPY ./files/scripts/setup-kerb.sh /scripts/ +RUN /scripts/setup-kerb.sh -# Install hadoop -RUN cd /etc/yum.repos.d/ && { curl -O https://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/cloudera-cdh5.repo ; cd -; } \ - && rpm --import https://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/RPM-GPG-KEY-cloudera \ - && yum install -y \ - hadoop-yarn-resourcemanager \ - hadoop-hdfs-namenode \ - hadoop-yarn-nodemanager \ - hadoop-hdfs-datanode \ - hadoop-client \ - hadoop-libhdfs \ - && yum clean all \ - && rm -rf /var/cache/yum - -# Copy over files -COPY ./files / - -# Fix container-executor permissions -RUN chmod 6050 /etc/hadoop/conf.kerberos/container-executor.cfg - -# Configure and setup hadoop -RUN /root/setup-hadoop.sh - -# Setup kerberos -RUN /root/setup-kerb.sh -ENV JAVA_HOME /usr/lib/jvm/jre-openjdk -ENV LIBHDFS3_CONF /etc/hadoop/conf/hdfs-site.xml -ENV HADOOP_CONF_DIR /etc/hadoop/conf -ENV HADOOP_HOME /usr/lib/hadoop -ENV HADOOP_COMMON_HOME /usr/lib/hadoop -ENV HADOOP_YARN_HOME /usr/lib/hadoop-yarn -ENV HADOOP_HDFS_HOME /usr/lib/hadoop-hdfs -ENV TEST_DASK_GATEWAY_YARN true -CMD ["/usr/bin/supervisord", "--configuration", "/etc/supervisord.conf"] +# Install supervisor +# +# - supervisord will be the entrypoint of the container, configured to start +# multiple services via provided files. +# - /etc/supervisor.conf declares that /etc/supervisor.d/* should be included +# among other things. 
+# - /etc/supervisor.d/* declares a few supervisor programs, running the +# following commands as specified user: +# +# COMMAND | USER | LOGFILE +# -------------------- | ---- | ----------------------------------------------------- +# hdfs datanode | hdfs | /var/log/supervisor/hdfs-datanode.log +# hdfs namenode | hdfs | /var/log/supervisor/hdfs-namenode.log +# krb5kdc | root | /var/log/supervisor/krb5kdc.log +# kadmind | root | /var/log/supervisor/kadmind.log +# yarn nodemanager | yarn | /var/log/supervisor/yarn-nodemanager.log +# yarn resourcemanager | yarn | /var/log/supervisor/yarn-resourcemanager.log +# +# 1. Install supervisor (which requires already installed epel-release). +# +RUN yum install -y \ + supervisor \ + && yum clean all \ + && rm -rf /var/cache/yum +# +# 2. Copy files used by supervisor +# +COPY ./files/etc/supervisord.d /etc/supervisord.d/ +COPY ./files/etc/supervisord.conf /etc/ +# +# 3. Configure the container to start supervisord with our configuration. +# +ENTRYPOINT ["/usr/bin/supervisord", "--configuration", "/etc/supervisord.conf"] diff --git a/continuous_integration/docker/hadoop/_install.sh b/continuous_integration/docker/hadoop/_install.sh index 23848c1b..52a7ec93 100755 --- a/continuous_integration/docker/hadoop/_install.sh +++ b/continuous_integration/docker/hadoop/_install.sh @@ -1,31 +1,20 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working -conda install psutil pykerberos -conda install -c conda-forge python=3.8 - -pip install \ - aiohttp \ - colorlog \ - dask \ - distributed \ - cryptography \ - traitlets \ - sqlalchemy \ - skein \ - pytest \ - pytest-asyncio +# pykerberos needs to compile c++ code that depends on system libraries, by +# installing it from conda-forge, we avoid such hassle. 
+mamba install pykerberos -pushd dask-gateway -python setup.py develop +# This installs everything besides compiling +# dask-gateway-server/dask-gateway-proxy +pushd tests +pip install -r requirements.txt popd -pushd dask-gateway-server -python setup.py develop -popd +# This ensures we also have a compiled dask-gateway-server/dask-gateway-proxy +# bundled with dask-gateway-proxy. +pip install --editable dask-gateway-server pip list diff --git a/continuous_integration/docker/hadoop/_script.sh b/continuous_integration/docker/hadoop/_script.sh index b319d10b..edd89088 100755 --- a/continuous_integration/docker/hadoop/_script.sh +++ b/continuous_integration/docker/hadoop/_script.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working - -py.test tests/test_yarn_backend.py tests/test_auth.py -v +pytest -v \ + tests/test_yarn_backend.py \ + tests/test_auth.py diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/container-executor.cfg b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/container-executor.cfg index 9973476e..04ccddcd 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/container-executor.cfg +++ b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/container-executor.cfg @@ -1,6 +1,10 @@ -yarn.nodemanager.local-dirs=/var/lib/hadoop-yarn/cache/yarn/nm-local-dir -yarn.nodemanager.linux-container-executor.group=yarn -yarn.nodemanager.log-dirs=/var/log/hadoop-yarn/containers -banned.users=hdfs,yarn,mapred,bin +# The configuration of yarn.nodemanager is duplicated to yarn-site.xml. It is +# unclear if we need it written out twice. 
+# +yarn.application.classpath=$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/* +yarn.nodemanager.local-dirs=/var/tmp/hadoop-yarn/local +yarn.nodemanager.log-dirs=/var/tmp/hadoop-yarn/log +yarn.nodemanager.linux-container-executor.group=hadoop -min.user.id=500 +banned.users=hdfs,yarn,mapred,bin +min.user.id=1000 diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/core-site.xml b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/core-site.xml index f1585ad8..48cf2d1b 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/core-site.xml +++ b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/core-site.xml @@ -52,7 +52,7 @@ hadoop.http.authentication.signature.secret.file - /etc/hadoop/conf/http-secret-file + /opt/hadoop/etc/hadoop/http-secret-file diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/hdfs-site.xml b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/hdfs-site.xml index 6e3639fa..f066376e 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/hdfs-site.xml +++ b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/hdfs-site.xml @@ -22,7 +22,7 @@ dfs.namenode.keytab.file - /etc/hadoop/conf/master-keytabs/hdfs.keytab + /opt/hadoop/etc/hadoop/master-keytabs/hdfs.keytab @@ -37,7 +37,7 @@ dfs.datanode.keytab.file - /etc/hadoop/conf/master-keytabs/hdfs.keytab + /opt/hadoop/etc/hadoop/master-keytabs/hdfs.keytab @@ -52,7 +52,7 @@ dfs.web.authentication.kerberos.keytab - /etc/hadoop/conf/master-keytabs/HTTP.keytab + /opt/hadoop/etc/hadoop/master-keytabs/HTTP.keytab diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/mapred-site.xml b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/mapred-site.xml index 89352338..7e416209 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/mapred-site.xml +++ 
b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/mapred-site.xml @@ -17,7 +17,7 @@ mapreduce.jobhistory.keytab - /etc/hadoop/conf/master-keytabs/mapred.keytab + /opt/hadoop/etc/hadoop/master-keytabs/mapred.keytab diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/yarn-site.xml b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/yarn-site.xml index 51f21646..9bb7a9f2 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/yarn-site.xml +++ b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.kerberos/yarn-site.xml @@ -13,15 +13,8 @@ yarn.application.classpath - $HADOOP_CONF_DIR, $HADOOP_COMMON_HOME/*, - $HADOOP_COMMON_HOME/lib/*, - $HADOOP_HDFS_HOME/*, - $HADOOP_HDFS_HOME/lib/*, - $HADOOP_MAPRED_HOME/*, - $HADOOP_MAPRED_HOME/lib/*, - $HADOOP_YARN_HOME/*, - $HADOOP_YARN_HOME/lib/* + $HADOOP_COMMON_HOME/lib/* @@ -32,7 +25,7 @@ yarn.nodemanager.log-dirs - file:///var/tmp/hadoop-yarn/logs + file:///var/tmp/hadoop-yarn/log @@ -72,7 +65,7 @@ yarn.resourcemanager.keytab - /etc/hadoop/conf/master-keytabs/yarn.keytab + /opt/hadoop/etc/hadoop/master-keytabs/yarn.keytab @@ -82,7 +75,7 @@ yarn.nodemanager.keytab - /etc/hadoop/conf/master-keytabs/yarn.keytab + /opt/hadoop/etc/hadoop/master-keytabs/yarn.keytab @@ -97,12 +90,12 @@ yarn.nodemanager.linux-container-executor.path - /usr/lib/hadoop-yarn/bin/container-executor + /opt/hadoop/bin/container-executor yarn.nodemanager.linux-container-executor.group - yarn + hadoop diff --git a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.simple/core-site.xml b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.simple/core-site.xml index df6090df..21eff8c8 100644 --- a/continuous_integration/docker/hadoop/files/etc/hadoop/conf.simple/core-site.xml +++ b/continuous_integration/docker/hadoop/files/etc/hadoop/conf.simple/core-site.xml @@ -42,7 +42,7 @@ hadoop.http.authentication.signature.secret.file - 
/etc/hadoop/conf/http-secret-file + /opt/hadoop/etc/hadoop/http-secret-file diff --git a/continuous_integration/docker/hadoop/files/etc/krb5.conf b/continuous_integration/docker/hadoop/files/etc/krb5.conf index 9413d1cc..449a8ebf 100644 --- a/continuous_integration/docker/hadoop/files/etc/krb5.conf +++ b/continuous_integration/docker/hadoop/files/etc/krb5.conf @@ -1,18 +1,23 @@ +# krb5.conf is a configuration for Kerberos. supervisord is configured to start +# a Kerberos Key Distribution Center (KDC) influenced by this configuration. +# +# krb5.conf reference: +# https://web.mit.edu/kerberos/krb5-1.19/doc/admin/conf_files/krb5_conf.html +# [logging] - default = FILE:/var/log/krb5libs.log - kdc = FILE:/var/log/krb5kdc.log - admin_server = FILE:/var/log/kadmind.log +default = FILE:/var/log/supervisor/krb5libs.log +kdc = FILE:/var/log/supervisor/krb5kdc.log +admin_server = FILE:/var/log/supervisor/kadmind.log [libdefaults] - default_realm = EXAMPLE.COM - dns_lookup_realm = false - dns_lookup_kdc = false - ticket_lifetime = 24h - renew_lifetime = 7d - forwardable = true +default_realm = EXAMPLE.COM +dns_lookup_realm = false +dns_lookup_kdc = false +ticket_lifetime = 24h +forwardable = true [realms] - EXAMPLE.COM = { - kdc = master.example.com - admin_server = master.example.com +EXAMPLE.COM = { + kdc = master.example.com + admin_server = master.example.com } diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.conf index 76b6e9d0..27e43bc6 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.conf @@ -1,8 +1,14 @@ +# supervisord starts other "programs" declared in the additional configuration +# files found in the /etc/supervisor.d folder. 
+# +# supervisord configuration reference: +# http://supervisord.org/configuration.html#configuration-file +# [supervisord] strip_ansi = true -nodaemon = true -logfile = /var/log/supervisord.log -pidfile = /var/run/supervisord.pid +nodaemon = true +logfile = /var/log/supervisord.log +pidfile = /var/run/supervisord.pid [unix_http_server] file = /tmp/supervisor.sock @@ -12,7 +18,7 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface [supervisorctl] serverurl = unix:///tmp/supervisor.sock -prompt = hadoop +prompt = hadoop [include] files = /etc/supervisord.d/*.conf diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-datanode.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-datanode.conf index 7f619a3d..7e3b22d7 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-datanode.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-datanode.conf @@ -1,9 +1,9 @@ [program:hdfs-datanode] -command=hdfs datanode -startsecs=2 -stopwaitsecs=10 -user=hdfs -redirect_stderr=true -stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-datanode.log -autostart=true -autorestart=false +user = hdfs +command = hdfs datanode +stdout_logfile = /var/log/supervisor/hdfs-datanode.log +redirect_stderr = true +autostart = true +autorestart = false +startsecs = 3 +stopwaitsecs = 10 diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-namenode.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-namenode.conf index a74d19f0..97f62481 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-namenode.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.d/hdfs-namenode.conf @@ -1,9 +1,9 @@ [program:hdfs-namenode] -command=hdfs namenode -startsecs=2 -stopwaitsecs=10 -user=hdfs -redirect_stderr=true -stdout_logfile=/var/log/hadoop-hdfs/hadoop-hdfs-namenode.log -autostart=true -autorestart=false 
+user = hdfs +command = hdfs namenode +stdout_logfile = /var/log/supervisor/hdfs-namenode.log +redirect_stderr = true +autostart = true +autorestart = false +startsecs = 3 +stopwaitsecs = 10 diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.d/kerberos.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.d/kerberos.conf index a0ac2282..b3d5ec59 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.d/kerberos.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.d/kerberos.conf @@ -1,15 +1,18 @@ +# krb5kdc or kadmind aren't emitting logs to stdout but writing logs directly to +# files as configured in /etc/krb5.conf +# [program:krb5kdc] -command=/bin/bash -c "exec /usr/sbin/krb5kdc -r EXAMPLE.COM -P /var/run/krb5kdc.pid -n" -redirect_stderr=true -stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -autostart=true -autorestart=true +user = root +command = /usr/sbin/krb5kdc -r EXAMPLE.COM -P /var/run/krb5kdc.pid -n +stdout_logfile = /dev/stdout +stdout_logfile_maxbytes = 0 +autostart = true +autorestart = true [program:kadmind] -command=/bin/bash -c "exec /usr/sbin/kadmind -r EXAMPLE.COM -P /var/run/kadmind.pid -nofork" -redirect_stderr=true -stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -autostart=true -autorestart=true +user = root +command = /usr/sbin/kadmind -r EXAMPLE.COM -P /var/run/kadmind.pid -nofork +stdout_logfile = /dev/stdout +stdout_logfile_maxbytes = 0 +autostart = true +autorestart = true diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-nodemanager.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-nodemanager.conf index 8a0db5be..be61e617 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-nodemanager.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-nodemanager.conf @@ -1,9 +1,9 @@ [program:yarn-nodemanager] -command=yarn nodemanager -startsecs=2 -stopwaitsecs=10 
-user=yarn -redirect_stderr=true -stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-nodemanager.log -autostart=true -autorestart=false +user = yarn +command = yarn nodemanager +stdout_logfile = /var/log/supervisor/yarn-nodemanager.log +redirect_stderr = true +autostart = true +autorestart = false +startsecs = 3 +stopwaitsecs = 10 diff --git a/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-resourcemanager.conf b/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-resourcemanager.conf index 4d5db5bc..6643a422 100644 --- a/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-resourcemanager.conf +++ b/continuous_integration/docker/hadoop/files/etc/supervisord.d/yarn-resourcemanager.conf @@ -1,9 +1,9 @@ [program:yarn-resourcemanager] -command=yarn resourcemanager -startsecs=2 -stopwaitsecs=10 -user=yarn -redirect_stderr=true -stdout_logfile=/var/log/hadoop-yarn/hadoop-yarn-resourcemanager.log -autostart=true -autorestart=false +user = yarn +command = yarn resourcemanager +stdout_logfile = /var/log/supervisor/yarn-resourcemanager.log +redirect_stderr = true +autostart = true +autorestart = false +startsecs = 3 +stopwaitsecs = 10 diff --git a/continuous_integration/docker/hadoop/files/root/setup-hadoop.sh b/continuous_integration/docker/hadoop/files/root/setup-hadoop.sh deleted file mode 100755 index 1fdb64f7..00000000 --- a/continuous_integration/docker/hadoop/files/root/setup-hadoop.sh +++ /dev/null @@ -1,33 +0,0 @@ -# /bin/bash - -set -ex - -# Configure HDFS -ln -s /etc/hadoop/conf.empty/log4j.properties /etc/hadoop/conf.simple/log4j.properties \ - && ln -s /etc/hadoop/conf.empty/log4j.properties /etc/hadoop/conf.kerberos/log4j.properties \ - && alternatives --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.simple 50 \ - && alternatives --set hadoop-conf /etc/hadoop/conf.simple - -# Create yarn directories with proper permissions -mkdir -p /var/tmp/hadoop-yarn/local /var/tmp/hadoop-yarn/logs \ - && chown -R yarn:yarn 
/var/tmp/hadoop-yarn/local /var/tmp/hadoop-yarn/logs - -# Create secret key to authenticate web access -dd if=/dev/urandom bs=64 count=1 > /etc/hadoop/conf/http-secret-file -chown hdfs:hadoop /etc/hadoop/conf/http-secret-file -chmod 440 /etc/hadoop/conf/http-secret-file - -# Format namenode -sudo -E -u hdfs bash -c "hdfs namenode -format -force" - -# Format filesystem -# XXX: Add to hosts to resolve name temporarily -echo "127.0.0.1 master.example.com" >> /etc/hosts -sudo -E -u hdfs bash -c "hdfs namenode"& -sudo -E -u hdfs bash -c "hdfs datanode"& -sudo -E -u hdfs /root/init-hdfs.sh -killall java - -# Install conf.kerberos as final conf -alternatives --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.kerberos 50 \ -&& alternatives --set hadoop-conf /etc/hadoop/conf.kerberos diff --git a/continuous_integration/docker/hadoop/files/root/setup-kerb.sh b/continuous_integration/docker/hadoop/files/root/setup-kerb.sh deleted file mode 100755 index 78e1205a..00000000 --- a/continuous_integration/docker/hadoop/files/root/setup-kerb.sh +++ /dev/null @@ -1,29 +0,0 @@ -#! 
/bin/bash - -create_keytabs() { - HOST="$1.example.com" - KEYTABS="/etc/hadoop/conf.kerberos/$1-keytabs" - kadmin.local -q "addprinc -randkey hdfs/$HOST@EXAMPLE.COM" \ - && kadmin.local -q "addprinc -randkey mapred/$HOST@EXAMPLE.COM" \ - && kadmin.local -q "addprinc -randkey yarn/$HOST@EXAMPLE.COM" \ - && kadmin.local -q "addprinc -randkey HTTP/$HOST@EXAMPLE.COM" \ - && mkdir "$KEYTABS" \ - && kadmin.local -q "xst -norandkey -k $KEYTABS/hdfs.keytab hdfs/$HOST HTTP/$HOST" \ - && kadmin.local -q "xst -norandkey -k $KEYTABS/mapred.keytab mapred/$HOST HTTP/$HOST" \ - && kadmin.local -q "xst -norandkey -k $KEYTABS/yarn.keytab yarn/$HOST HTTP/$HOST" \ - && kadmin.local -q "xst -norandkey -k $KEYTABS/HTTP.keytab HTTP/$HOST" \ - && chown hdfs:hadoop $KEYTABS/hdfs.keytab \ - && chown mapred:hadoop $KEYTABS/mapred.keytab \ - && chown yarn:hadoop $KEYTABS/yarn.keytab \ - && chown hdfs:hadoop $KEYTABS/HTTP.keytab \ - && chmod 440 $KEYTABS/*.keytab -} - -kdb5_util create -s -P testpass \ -&& create_keytabs master \ -&& kadmin.local -q "addprinc -pw adminpass root/admin" \ -&& kadmin.local -q "addprinc -pw testpass dask" \ -&& kadmin.local -q "addprinc -pw testpass alice" \ -&& kadmin.local -q "addprinc -pw testpass bob" \ -&& kadmin.local -q "xst -norandkey -k /home/dask/dask.keytab dask HTTP/master.example.com" \ -&& chown dask:dask /home/dask/dask.keytab diff --git a/continuous_integration/docker/hadoop/files/root/init-hdfs.sh b/continuous_integration/docker/hadoop/files/scripts/init-hdfs.sh similarity index 98% rename from continuous_integration/docker/hadoop/files/root/init-hdfs.sh rename to continuous_integration/docker/hadoop/files/scripts/init-hdfs.sh index 9d93a6d9..fc89f094 100755 --- a/continuous_integration/docker/hadoop/files/root/init-hdfs.sh +++ b/continuous_integration/docker/hadoop/files/scripts/init-hdfs.sh @@ -1,4 +1,5 @@ -#! 
/bin/bash +#!/bin/bash +set -x # Exponential backoff on testing hdfs status, then run init script echo "Waiting to connect to HDFS" diff --git a/continuous_integration/docker/hadoop/files/scripts/setup-hadoop.sh b/continuous_integration/docker/hadoop/files/scripts/setup-hadoop.sh new file mode 100755 index 00000000..74b4aab5 --- /dev/null +++ b/continuous_integration/docker/hadoop/files/scripts/setup-hadoop.sh @@ -0,0 +1,75 @@ +#!/bin/bash +set -ex + +# Tweak hadoop configuration and permissions: +# +# - hadoop is unpacked with default configuration in etc/hadoop, we relocate +# that to /etc/hadoop/conf.empty. +# +mv /opt/hadoop/etc/hadoop /etc/hadoop/conf.empty +# +# - log4j.properties is a requirement to have in the hadoop configuration +# directory that we don't want to redefine, so we copy it from the default +# configuration to our configurations. +# +cp /etc/hadoop/conf.empty/log4j.properties /etc/hadoop/conf.simple/ +cp /etc/hadoop/conf.empty/log4j.properties /etc/hadoop/conf.kerberos/ +# +# - Create /opt/hadoop/logs directory with high group permissions to ensure it +# isn't created with narrow permissions later when running "hdfs namenode". +# +mkdir -p /opt/hadoop/logs +chmod g+w /opt/hadoop/logs +# +# - Create /var/tmp/dfs/name directory with (?) permissions to ensure it isn't +# created with wrong permissions later. +# +mkdir -p /var/tmp/dfs/name +chown -R root:hadoop /var/tmp +chmod -R 6770 /var/tmp +# +# - Generate a key to authenticate web access during the brief time we use the +# /etc/hadoop/conf.simple configuration as part of building the docker image. +# +dd if=/dev/urandom bs=64 count=1 > /etc/hadoop/conf.simple/http-secret-file +chown root:hadoop /etc/hadoop/conf.simple/http-secret-file +chmod 440 /etc/hadoop/conf.simple/http-secret-file +# +# - Declare HDFS configuration to use temporarily, let /opt/hadoop/etc/hadoop +# point to /etc/hadoop/conf.simple.
+# +alternatives --install /opt/hadoop/etc/hadoop hadoop-conf /etc/hadoop/conf.simple 50 +alternatives --set hadoop-conf /etc/hadoop/conf.simple + + + + +# Initialize HDFS filesystem with content to test against +# +# 1. Delete all hdfs files and start with a clean slate. +# +sudo --preserve-env --user hdfs \ + hdfs namenode -format -force +# +# 2. Add to hosts to resolve a domain name, /etc/hosts will be cleared when the +# container starts though, see https://stackoverflow.com/a/25613983. This +# container is supposed to start with "--hostname master.example.com". +# +echo "127.0.0.1 master.example.com" >> /etc/hosts +# +# 3. Start "hdfs namenode" and "hdfs datanode" but detach with "&" to continue +# doing other things. +# +sudo --preserve-env --user hdfs \ + hdfs namenode & +sudo --preserve-env --user hdfs \ + hdfs datanode & +# +# 4. Run a script to bootstrap the HDFS filesystem with content for testing. +# +sudo --preserve-env --user hdfs \ + /scripts/init-hdfs.sh +# +# 5. Shut down started "hdfs namenode" and "hdfs datanode" processes. +# +pkill java diff --git a/continuous_integration/docker/hadoop/files/scripts/setup-kerb.sh b/continuous_integration/docker/hadoop/files/scripts/setup-kerb.sh new file mode 100755 index 00000000..06ad8c27 --- /dev/null +++ b/continuous_integration/docker/hadoop/files/scripts/setup-kerb.sh @@ -0,0 +1,88 @@ +#!/bin/bash +set -ex + +# This script configures file system permissions and initializes kerberos with +# principals and passwords for later tests using kdb5_util and kadmin.local. +# +# References: +# - kadmin.local: https://web.mit.edu/kerberos/krb5-1.12/doc/admin/admin_commands/kadmin_local.html?highlight=kadmin#options +# - kdb5_util: https://web.mit.edu/kerberos/krb5-1.12/doc/admin/admin_commands/kdb5_util.html +# + +# Tweak file system permissions and switch to new hadoop config at +# /etc/hadoop/conf.kerberos.
+# +# - The /opt/hadoop/bin/container-executor binary has stringent permissions +# requirements on itself and its configuration. +# +# About binary: +# - user-owned by root / group-owned by special group (chown root:hadoop) +# - others do not have any permissions (chmod xxx0) +# - be setuid/setgid (chmod 6xxx) +# - Reference: https://github.com/apache/hadoop/blob/907ef6c2858dc42d83cc228d13409830a7d7b163/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.h#L78-L88 +# +# About config: +# - user-owned by root +# - not writable by group / world +# - Reference: https://github.com/apache/hadoop/blob/03cfc852791c14fad39db4e5b14104a276c08e59/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/configuration.c#L78-L100 +# +chmod 6050 /opt/hadoop/bin/container-executor +chmod 440 /etc/hadoop/conf.kerberos/container-executor.cfg +# +# - Create directories declared both in /etc/hadoop/conf.kerberos/yarn-site.xml +# and /etc/hadoop/conf.kerberos/container-executor.cfg about: +# +# - yarn.nodemanager.local-dirs +# - yarn.nodemanager.log-dirs +# +# These directories must have 751 permissions and have yarn:hadoop owners. +# +# Reference: https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/SecureContainer.html#Configuration +# +mkdir -p \ + /var/tmp/hadoop-yarn/local \ + /var/tmp/hadoop-yarn/log +chown -R yarn:hadoop \ + /var/tmp/hadoop-yarn/local \ + /var/tmp/hadoop-yarn/log +chmod 751 \ + /var/tmp/hadoop-yarn/local \ + /var/tmp/hadoop-yarn/log +# +# - Declare HDFS configuration to use, let /opt/hadoop/etc/hadoop point to +# /etc/hadoop/conf.kerberos now instead of /etc/hadoop/conf.simple that was +# used to bootstrap hdfs with folders and files. 
+# +alternatives --install /opt/hadoop/etc/hadoop hadoop-conf /etc/hadoop/conf.kerberos 50 +alternatives --set hadoop-conf /etc/hadoop/conf.kerberos + + + +# Initialize kereberos with principals and keytables +# +kdb5_util create -s -P testpass + +HOST="master.example.com" +KEYTABS="/etc/hadoop/conf.kerberos/master-keytabs" +kadmin.local -q "addprinc -randkey hdfs/$HOST@EXAMPLE.COM" +kadmin.local -q "addprinc -randkey mapred/$HOST@EXAMPLE.COM" +kadmin.local -q "addprinc -randkey yarn/$HOST@EXAMPLE.COM" +kadmin.local -q "addprinc -randkey HTTP/$HOST@EXAMPLE.COM" +mkdir "$KEYTABS" +kadmin.local -q "xst -norandkey -k $KEYTABS/hdfs.keytab hdfs/$HOST HTTP/$HOST" +kadmin.local -q "xst -norandkey -k $KEYTABS/mapred.keytab mapred/$HOST HTTP/$HOST" +kadmin.local -q "xst -norandkey -k $KEYTABS/yarn.keytab yarn/$HOST HTTP/$HOST" +kadmin.local -q "xst -norandkey -k $KEYTABS/HTTP.keytab HTTP/$HOST" +chown hdfs:hadoop $KEYTABS/hdfs.keytab +chown mapred:hadoop $KEYTABS/mapred.keytab +chown yarn:hadoop $KEYTABS/yarn.keytab +chown hdfs:hadoop $KEYTABS/HTTP.keytab +chmod 440 $KEYTABS/*.keytab + +kadmin.local -q "addprinc -pw adminpass root/admin" +kadmin.local -q "addprinc -pw testpass dask" +kadmin.local -q "addprinc -pw testpass alice" +kadmin.local -q "addprinc -pw testpass bob" +kadmin.local -q "xst -norandkey -k /home/dask/dask.keytab dask HTTP/master.example.com" +chown dask:dask /home/dask/dask.keytab +chmod 440 /home/dask/dask.keytab diff --git a/continuous_integration/docker/hadoop/files/var/kerberos/krb5kdc/kdc.conf b/continuous_integration/docker/hadoop/files/var/kerberos/krb5kdc/kdc.conf index aa0ae10f..956d7b03 100644 --- a/continuous_integration/docker/hadoop/files/var/kerberos/krb5kdc/kdc.conf +++ b/continuous_integration/docker/hadoop/files/var/kerberos/krb5kdc/kdc.conf @@ -1,11 +1,11 @@ [kdcdefaults] - kdc_ports = 88 - kdc_tcp_ports = 88 +kdc_ports = 88 +kdc_tcp_ports = 88 [realms] - EXAMPLE.COM = { - acl_file = /var/kerberos/krb5kdc/kadm5.acl - dict_file = 
/usr/share/dict/words - admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab - supported_enctypes = des3-hmac-sha1:normal arcfour-hmac:normal des-hmac-sha1:normal des-cbc-md5:normal des-cbc-crc:normal +EXAMPLE.COM = { + acl_file = /var/kerberos/krb5kdc/kadm5.acl + dict_file = /usr/share/dict/words + admin_keytab = /var/kerberos/krb5kdc/kadm5.keytab + supported_enctypes = des3-hmac-sha1:normal arcfour-hmac:normal des-hmac-sha1:normal des-cbc-md5:normal des-cbc-crc:normal } diff --git a/continuous_integration/docker/hadoop/start.sh b/continuous_integration/docker/hadoop/start.sh index c4afb5e8..b962d55c 100755 --- a/continuous_integration/docker/hadoop/start.sh +++ b/continuous_integration/docker/hadoop/start.sh @@ -12,4 +12,4 @@ docker run --rm -d \ -p 8000:8000 \ -p 8786:8786 \ -p 8088:8088 \ - daskgateway/testing-hadoop + ghcr.io/dask/dask-gateway-ci-hadoop diff --git a/continuous_integration/docker/pbs/Dockerfile b/continuous_integration/docker/pbs/Dockerfile index 851619e4..b590f10d 100644 --- a/continuous_integration/docker/pbs/Dockerfile +++ b/continuous_integration/docker/pbs/Dockerfile @@ -1,20 +1,46 @@ -FROM daskgateway/testing-base:latest - -# Install pbspro (now openpbs) -RUN yum install -y unzip \ - && curl -L -o /tmp/pbspro.zip https://github.com/openpbs/openpbs/releases/download/v18.1.4/pbspro_1.8.4.centos7.zip \ - && unzip /tmp/pbspro.zip -d /tmp/pbspro \ - && yum install -y sudo /tmp/pbspro/pbspro*/pbspro-server-*.rpm \ - && yum remove -y unzip \ - && yum clean all \ - && rm -rf /var/cache/yum +# See continuous_integration/docker/README.md for details about this and other +# Dockerfiles under the continuous_integration/docker folder on their purpose +# and how to work with them. +# +FROM ghcr.io/dask/dask-gateway-ci-base:latest # Notify dask-gateway tests that PBS is available ENV TEST_DASK_GATEWAY_PBS true -ENV PBS_MASTER pbs_master + +# Install openpbs +# +# 1. 
Download and install .rpm +# +# OpenPBS versions: https://github.com/openpbs/openpbs/releases +# +# We use an old version because there isn't a modern one pre-built for +# centos:7 as used in the base image. The old version was called pbspro, so +# there is a change needed in the download url related to that if switching +# to a newer version. +# +RUN INSTALL_OPENPBS_VERSION=19.1.3 \ + && yum install -y unzip \ + \ + && curl -sL -o /tmp/openpbs.zip https://github.com/openpbs/openpbs/releases/download/v${INSTALL_OPENPBS_VERSION}/pbspro_${INSTALL_OPENPBS_VERSION}.centos_7.zip \ + && unzip /tmp/openpbs.zip -d /opt/openpbs \ + && rm /tmp/openpbs.zip \ + && yum install -y \ + /opt/openpbs/*pbs*/*-server-*.rpm \ + \ + && yum remove -y unzip \ + && yum clean all \ + && rm -rf /var/cache/yum +# +# 2. Update PATH environment variable +# +# Note that this PATH environment will be preserved when sudo is used to +# switch to other users thanks to changes to /etc/sudoers.d/preserve_path, +# which is configured in the base Dockerfile.
+# +ENV PATH=/opt/pbs/bin:$PATH # Copy over files COPY ./files / -ENTRYPOINT ["/opt/miniconda/bin/tini", "-g", "--"] -CMD ["/root/start.sh"] +ENTRYPOINT ["/opt/python/bin/tini", "-g", "--"] +CMD ["/scripts/start.sh"] diff --git a/continuous_integration/docker/pbs/_install.sh b/continuous_integration/docker/pbs/_install.sh index 2b3d10dc..a68881ec 100755 --- a/continuous_integration/docker/pbs/_install.sh +++ b/continuous_integration/docker/pbs/_install.sh @@ -1,30 +1,16 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working -conda install psutil -conda install -c conda-forge python=3.8 - -pip install \ - aiohttp \ - colorlog \ - dask \ - distributed \ - cryptography \ - traitlets \ - sqlalchemy \ - pytest \ - pytest-asyncio - -pushd dask-gateway -python setup.py develop +# This installs everything besides compiling +# dask-gateway-server/dask-gateway-proxy +pushd tests +pip install -r requirements.txt popd -pushd dask-gateway-server -python setup.py develop -popd +# This ensures we also have a compiled dask-gateway-server/dask-gateway-proxy +# bundled with dask-gateway-proxy. 
+pip install --editable dask-gateway-server pip list diff --git a/continuous_integration/docker/pbs/_script.sh b/continuous_integration/docker/pbs/_script.sh index 974bc28e..41972f0b 100755 --- a/continuous_integration/docker/pbs/_script.sh +++ b/continuous_integration/docker/pbs/_script.sh @@ -1,8 +1,5 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working - -py.test tests/test_pbs_backend.py -v +pytest -v tests/test_pbs_backend.py diff --git a/continuous_integration/docker/pbs/files/etc/sudoers.d/dask b/continuous_integration/docker/pbs/files/etc/sudoers.d/dask index 4124a81a..19a93ee4 100644 --- a/continuous_integration/docker/pbs/files/etc/sudoers.d/dask +++ b/continuous_integration/docker/pbs/files/etc/sudoers.d/dask @@ -1,4 +1,4 @@ -Cmnd_Alias DASK_GATEWAY_JOBQUEUE_LAUNCHER = /opt/miniconda/bin/dask-gateway-jobqueue-launcher +Cmnd_Alias DASK_GATEWAY_JOBQUEUE_LAUNCHER = /opt/python/bin/dask-gateway-jobqueue-launcher %dask_users ALL=(dask) /usr/bin/sudo dask ALL=(%dask_users) NOPASSWD:DASK_GATEWAY_JOBQUEUE_LAUNCHER diff --git a/continuous_integration/docker/pbs/files/root/start.sh b/continuous_integration/docker/pbs/files/scripts/start.sh similarity index 100% rename from continuous_integration/docker/pbs/files/root/start.sh rename to continuous_integration/docker/pbs/files/scripts/start.sh diff --git a/continuous_integration/docker/pbs/start.sh b/continuous_integration/docker/pbs/start.sh index bb6fb28f..06ce7f0e 100755 --- a/continuous_integration/docker/pbs/start.sh +++ b/continuous_integration/docker/pbs/start.sh @@ -13,4 +13,4 @@ docker run --rm -d \ -p 8786:8786 \ -p 8088:8088 \ --cap-add=SYS_RESOURCE \ - daskgateway/testing-pbs + ghcr.io/dask/dask-gateway-ci-pbs diff --git a/continuous_integration/docker/slurm/Dockerfile b/continuous_integration/docker/slurm/Dockerfile index cb8aaf36..ad8d8602 100644 --- a/continuous_integration/docker/slurm/Dockerfile +++ b/continuous_integration/docker/slurm/Dockerfile @@ -1,31 +1,51 @@ -FROM 
daskgateway/testing-base:latest +# See continuous_integration/docker/README.md for details about this and other +# Dockerfiles under the continuous_integration/docker folder on their purpose +# and how to work with them. +# +FROM ghcr.io/dask/dask-gateway-ci-base:latest -# Build and install slurm -RUN yum install -y epel-release \ - && yum install -y \ - bzip2 \ +# Notify dask-gateway tests that Slurm is available +ENV TEST_DASK_GATEWAY_SLURM true + +# Install Slurm +# +# 1. Download and compile slurm +# +# Slurm versions: https://download.schedmd.com/slurm/ +# Slurm release notes: https://github.com/SchedMD/slurm/blame/HEAD/RELEASE_NOTES +# +RUN INSTALL_SLURM_VERSION=21.08.6 \ + && yum install -y \ + # required to install supervisor (and more?) epel-release \ + && yum install -y \ + # temporary installation dependencies later uninstalled + bzip2 \ gcc \ - man2html \ - mariadb-server \ mariadb-devel \ - munge \ munge-devel \ ncurses-devel \ - openssl \ openssl-devel \ - perl \ readline-devel \ - && curl -L https://download.schedmd.com/slurm/slurm-19.05.0.tar.bz2 -o /tmp/slurm.tar.bz2 \ - && cd /tmp \ - && tar -jxf /tmp/slurm.tar.bz2 \ - && cd slurm-19.05.0 \ - && ./configure --sysconfdir=/etc/slurm --with-mysql_config=/usr/bin --libdir=/usr/lib64 \ - && make install \ - && cd .. 
\ - && rm -rf slurm-19.05.0 \ - && rm -r slurm.tar.bz2 \ - && yum remove -y \ + # persistent installation dependencies + man2html \ + mariadb-server \ + munge \ + openssl \ + perl \ + supervisor \ + \ + && curl -sL https://download.schedmd.com/slurm/slurm-${INSTALL_SLURM_VERSION}.tar.bz2 \ + | tar --extract --verbose --bzip2 --directory=/tmp \ + && cd /tmp/slurm-* \ + && ./configure \ + --sysconfdir=/etc/slurm \ + --with-mysql_config=/usr/bin \ + --libdir=/usr/lib64 \ + && make install \ + && rm -rf /tmp/slurm-* \ + \ + && yum remove -y \ bzip2 \ gcc \ mariadb-devel \ @@ -33,34 +53,41 @@ RUN yum install -y epel-release \ ncurses-devel \ openssl-devel \ readline-devel \ - && yum clean all \ - && rm -rf /var/cache/yum - -# Install supervisor and sudo -RUN yum install -y supervisor sudo \ - && yum clean all \ - && rm -rf /var/cache/yum - -# Setup Slurm -RUN groupadd -r slurm \ - && useradd -r -g slurm slurm \ - && mkdir /etc/sysconfig/slurm \ - /var/spool/slurmd \ - /var/run/slurmd \ + && yum clean all \ + && rm -rf /var/cache/yum +# +# 2. Setup Slurm +# +RUN groupadd --system slurm \ + && useradd --system --gid slurm slurm \ + && mkdir \ + /etc/sysconfig/slurm \ /var/lib/slurmd \ /var/log/slurm \ - && chown slurm:slurm /var/spool/slurmd \ /var/run/slurmd \ + /var/spool/slurmd \ + && chown slurm:slurm \ /var/lib/slurmd \ /var/log/slurm \ - && /sbin/create-munge-key - -# Copy over files -COPY ./files / - -RUN /root/init-mysql.sh - -# Notify dask-gateway tests that Slurm is available -ENV TEST_DASK_GATEWAY_SLURM true + /var/run/slurmd \ + /var/spool/slurmd \ + && /sbin/create-munge-key +# +# 3. 
Copy misc configuration files +# +COPY --chown=slurm:slurm ./files/etc/slurm /etc/slurm/ +COPY ./files/etc/sudoers.d /etc/sudoers.d/ +COPY ./files/etc/supervisord.conf /etc/ +COPY ./files/etc/supervisord.d /etc/supervisord.d/ +RUN chmod 644 /etc/slurm/slurm.conf \ + && chmod 600 /etc/slurm/slurmdbd.conf \ + && chmod 440 /etc/sudoers.d/dask \ + && chmod 644 /etc/supervisord.conf \ + && chmod 644 /etc/supervisord.d/* +# +# 4. Initialize a Slurm database +# +COPY ./files/scripts /scripts/ +RUN /scripts/init-mysql.sh -CMD ["/usr/bin/supervisord", "--configuration", "/etc/supervisord.conf"] +ENTRYPOINT ["/usr/bin/supervisord", "--configuration", "/etc/supervisord.conf"] diff --git a/continuous_integration/docker/slurm/_install.sh b/continuous_integration/docker/slurm/_install.sh index 2b3d10dc..a68881ec 100755 --- a/continuous_integration/docker/slurm/_install.sh +++ b/continuous_integration/docker/slurm/_install.sh @@ -1,30 +1,16 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working -conda install psutil -conda install -c conda-forge python=3.8 - -pip install \ - aiohttp \ - colorlog \ - dask \ - distributed \ - cryptography \ - traitlets \ - sqlalchemy \ - pytest \ - pytest-asyncio - -pushd dask-gateway -python setup.py develop +# This installs everything besides compiling +# dask-gateway-server/dask-gateway-proxy +pushd tests +pip install -r requirements.txt popd -pushd dask-gateway-server -python setup.py develop -popd +# This ensures we also have a compiled dask-gateway-server/dask-gateway-proxy +# bundled with dask-gateway-proxy. 
+pip install --editable dask-gateway-server pip list diff --git a/continuous_integration/docker/slurm/_script.sh b/continuous_integration/docker/slurm/_script.sh index 24b59bf9..fb3f4690 100755 --- a/continuous_integration/docker/slurm/_script.sh +++ b/continuous_integration/docker/slurm/_script.sh @@ -1,8 +1,5 @@ #!/usr/bin/env bash -source ~/.bashrc - set -xe cd /working - -py.test tests/test_slurm_backend.py -v +pytest -v tests/test_slurm_backend.py diff --git a/continuous_integration/docker/slurm/files/etc/slurm/slurm.conf b/continuous_integration/docker/slurm/files/etc/slurm/slurm.conf index dc9e6a53..bdcbd1e3 100644 --- a/continuous_integration/docker/slurm/files/etc/slurm/slurm.conf +++ b/continuous_integration/docker/slurm/files/etc/slurm/slurm.conf @@ -1,3 +1,5 @@ +# Configuration reference: https://slurm.schedmd.com/slurm.conf.html +# ClusterName=linux ControlMachine=slurm SlurmUser=slurm @@ -11,7 +13,6 @@ MpiDefault=none SlurmctldPidFile=/var/run/slurmd/slurmctld.pid SlurmdPidFile=/var/run/slurmd/slurmd.pid ProctrackType=proctrack/pgid -CacheGroups=0 ReturnToService=0 SlurmctldTimeout=300 SlurmdTimeout=300 @@ -29,7 +30,7 @@ SlurmdLogFile=/var/log/slurm/slurmd.log JobCompType=jobcomp/none AccountingStorageType=accounting_storage/slurmdbd # Nodes -FastSchedule=2 +SlurmdParameters=config_overrides NodeName=slurm RealMemory=4096 Sockets=4 CoresPerSocket=4 ThreadsPerCore=4 # Partitions PartitionName=DEFAULT Nodes=ALL OverSubscribe=FORCE:8 MaxTime=INFINITE State=UP diff --git a/continuous_integration/docker/slurm/files/etc/sudoers.d/dask b/continuous_integration/docker/slurm/files/etc/sudoers.d/dask index 4124a81a..19a93ee4 100644 --- a/continuous_integration/docker/slurm/files/etc/sudoers.d/dask +++ b/continuous_integration/docker/slurm/files/etc/sudoers.d/dask @@ -1,4 +1,4 @@ -Cmnd_Alias DASK_GATEWAY_JOBQUEUE_LAUNCHER = /opt/miniconda/bin/dask-gateway-jobqueue-launcher +Cmnd_Alias DASK_GATEWAY_JOBQUEUE_LAUNCHER = 
/opt/python/bin/dask-gateway-jobqueue-launcher %dask_users ALL=(dask) /usr/bin/sudo dask ALL=(%dask_users) NOPASSWD:DASK_GATEWAY_JOBQUEUE_LAUNCHER diff --git a/continuous_integration/docker/slurm/files/etc/supervisord.d/slurm.conf b/continuous_integration/docker/slurm/files/etc/supervisord.d/slurm.conf index 7971d509..928af3b4 100644 --- a/continuous_integration/docker/slurm/files/etc/supervisord.d/slurm.conf +++ b/continuous_integration/docker/slurm/files/etc/supervisord.d/slurm.conf @@ -46,6 +46,7 @@ user=root command=/bin/bash -c "until 2>/dev/null >/dev/tcp/localhost/6819; do sleep 1; done && /usr/local/sbin/slurmctld -Dvvv" autostart=true autorestart=false +startsecs=3 exitcodes=0,1,2 stdout_logfile=/var/log/supervisor/slurmctld.log stdout_logfile_maxbytes=1MB diff --git a/continuous_integration/docker/slurm/files/root/init-mysql.sh b/continuous_integration/docker/slurm/files/scripts/init-mysql.sh similarity index 94% rename from continuous_integration/docker/slurm/files/root/init-mysql.sh rename to continuous_integration/docker/slurm/files/scripts/init-mysql.sh index 88332c73..cd9a8e29 100755 --- a/continuous_integration/docker/slurm/files/root/init-mysql.sh +++ b/continuous_integration/docker/slurm/files/scripts/init-mysql.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -yum install -y psmisc - if [ ! -f "/var/lib/mysql/ibdata1" ]; then echo "- Initializing database" /usr/bin/mysql_install_db &> /dev/null @@ -35,7 +33,7 @@ if [ ! -d "/var/lib/mysql/slurm_acct_db" ]; then mysql -NBe "GRANT ALL PRIVILEGES on slurm_acct_db.* to 'slurm'@'localhost'" mysql -NBe "FLUSH PRIVILEGES" echo "- Slurm acct database created. Stopping MariaDB" - killall mysqld + pkill mysqld for count in {30..0}; do if echo "SELECT 1" | mysql &> /dev/null; then sleep 1 @@ -48,6 +46,3 @@ if [ ! 
-d "/var/lib/mysql/slurm_acct_db" ]; then exit 1 fi fi - -yum remove -y psmisc -rm -rf /var/cache/yum diff --git a/continuous_integration/docker/slurm/start.sh b/continuous_integration/docker/slurm/start.sh index fa330cee..fc5e41d2 100755 --- a/continuous_integration/docker/slurm/start.sh +++ b/continuous_integration/docker/slurm/start.sh @@ -12,4 +12,4 @@ docker run --rm -d \ -p 8000:8000 \ -p 8786:8786 \ -p 8088:8088 \ - daskgateway/testing-slurm + ghcr.io/dask/dask-gateway-ci-slurm diff --git a/continuous_integration/install.sh b/continuous_integration/install.sh deleted file mode 100755 index cafefe80..00000000 --- a/continuous_integration/install.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -xe - -npm install -g configurable-http-proxy - -# if you make updates here, please update the -# entries in ../dev-environment.yaml -pip install -U \ - aiohttp \ - colorlog \ - cryptography \ - dask \ - distributed \ - ipywidgets \ - jupyterhub \ - notebook \ - pytest \ - pytest-asyncio \ - sqlalchemy \ - tornado \ - traitlets \ - trustme - -pushd dask-gateway -python setup.py develop -popd - -pushd dask-gateway-server -python setup.py develop -popd - -pip list - -set +xe diff --git a/continuous_integration/kubernetes/install.sh b/continuous_integration/kubernetes/install.sh deleted file mode 100755 index 7f0197fd..00000000 --- a/continuous_integration/kubernetes/install.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env bash -set -xe - -pip install -U \ - aiohttp \ - colorlog \ - cryptography \ - dask \ - distributed \ - pytest \ - pytest-asyncio \ - kubernetes-asyncio \ - sqlalchemy \ - traitlets - -pushd dask-gateway -sudo python setup.py develop -popd - -pushd dask-gateway-server -sudo python setup.py develop --no-build-proxy -popd - -pip list diff --git a/continuous_integration/kubernetes/script.sh b/continuous_integration/kubernetes/script.sh deleted file mode 100755 index afbbf3c8..00000000 --- a/continuous_integration/kubernetes/script.sh +++ 
/dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -TEST_DASK_GATEWAY_KUBE=true TEST_DASK_GATEWAY_KUBE_ADDRESS=http://localhost:30200/services/dask-gateway/ py.test tests/kubernetes/ -vv diff --git a/dask-gateway-server/setup.py b/dask-gateway-server/setup.py index 4c9016c8..b89b79e5 100644 --- a/dask-gateway-server/setup.py +++ b/dask-gateway-server/setup.py @@ -99,14 +99,27 @@ def run(self): # NOTE: changes to the dependencies here must also be reflected # in ../dev-environment.yaml -install_requires = ["aiohttp", "colorlog", "cryptography", "tornado", "traitlets"] +install_requires = [ + "aiohttp", + "colorlog", + "cryptography", + "tornado", + "traitlets", +] extras_require = { + # pykerberos is tricky to install and requires a system package to + # successfully compile some C code, on ubuntu this is libkrb5-dev. "kerberos": ["pykerberos"], "jobqueue": ["sqlalchemy"], "local": ["sqlalchemy"], "yarn": ["sqlalchemy", "skein >= 0.7.3"], "kubernetes": ["kubernetes_asyncio"], + "all_backends": [ + "sqlalchemy", + "skein >= 0.7.3", + "kubernetes_asyncio", + ], } # Due to quirks in setuptools/distutils dependency ordering, to get the go diff --git a/dask-gateway/setup.py b/dask-gateway/setup.py index 03d1c55d..2728be21 100644 --- a/dask-gateway/setup.py +++ b/dask-gateway/setup.py @@ -9,13 +9,25 @@ # NOTE: changes to the dependencies here must also be reflected # in ../dev-environment.yaml -install_requires = ["aiohttp", "dask>=2.2.0", "distributed>=2.2.0", "tornado"] +install_requires = [ + "aiohttp", + "dask >= 2.2.0", + "distributed >= 2.2.0", + "pyyaml", + "tornado", +] extras_require = { "kerberos": [ 'pykerberos;platform_system!="Windows"', 'winkerberos;platform_system=="Windows"', - ] + ], + # all should represent all parts of extra_require and is used when building + # docs and running tests. 
+ "all": [ + 'pykerberos;platform_system!="Windows"', + 'winkerberos;platform_system=="Windows"', + ], } setup( diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..25b8b40c --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,20 @@ +# This file describes the requirements to build the documentation, which you can +# do by the following commands: +# +# pip install -r requirements.txt +# make html +# +sphinx +dask-sphinx-theme + +# sphinx-autobuild enables the "make devenv" command defined in the Makefile to +# automatically rebuild the documentation on changes and update live-reload a +# browser. +sphinx-autobuild + +# autodoc-traits will inspect the dask-gateway and dask-gateway-server's Python +# code to generate reference documentation. It will omit files if ImportErrors +# are thrown so we install these packages with all dependencies to avoid that. +autodoc-traits +--editable="../dask-gateway" +--editable="../dask-gateway-server[all_backends]" --install-option="--no-build-proxy" diff --git a/docs/source/develop.rst b/docs/source/develop.rst index f990c1f5..c606e0e3 100644 --- a/docs/source/develop.rst +++ b/docs/source/develop.rst @@ -132,7 +132,7 @@ The tests can then be run as: .. code-block:: shell # Run the test suite - $ py.test tests -vv + $ pytest -v tests In addition to the main tests, additional tests for the various backends are diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 00000000..151a385e --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,65 @@ +# This file describes the requirements to test the Python code in dask-gateway +# and dask-gateway server. +# +# This is how you would install and run most tests: +# +# pip install -r requirements.txt +# pytest +# +# +# FIXME: +# - Make kubernetes test like other backend tests, something you opt into rather +# than out out of. 
+# +pytest +pytest-asyncio +# dask-gateway and dask-gateway-server and all their dependencies are assumed to +# be installed. +--editable="../dask-gateway" +--editable="../dask-gateway-server[all_backends]" --install-option="--no-build-proxy" +# ipython and ipywidgets are optional integrations allowing for fancy rendering of +# end user provided configuration options. Tests in test_options.py will be +# skipped without this installed. +ipython +ipywidgets +# bokeh needs to be installed for test_dashboard_link_from_public_address to not +# be skipped. +# +# FIXME: clarify why bokeh is needed for this test. +# +bokeh +# trustme is a utility used in the code of the test ca_and_tls_proxy in +# test_proxies.py. +trustme +# IMPORTANT: These environment variables indicate that tests should be run with +# integration against external dask cluster providers (backends). +# +# For this to work, there needs to be various things running in the +# background. +# +# TEST_DASK_GATEWAY_YARN - test_yarn_backend.py, and test_kerberos_auth in test_auth.py +# TEST_DASK_GATEWAY_PBS - test_pbs_backend.py +# TEST_DASK_GATEWAY_SLURM - test_slurm_backend.py +# TEST_DASK_GATEWAY_KUBE - kubernetes/test_integration.py +# +# TEST_DASK_GATEWAY_KUBE_ADDRESS is also used to describe how to reach the +# traefik pod used as a proxy to access dask-gateway-server running in the api +# pod. +# +# IMPORTANT: Not installed Python packages with system dependencies +# +# - To run tests related to KerberosAuthenticator, you need to install +# pykerberos which is tricky to install with pip but easy with conda. For +# example, to install pykerberos with pip on ubuntu, you need to first +# install the apt package libkrb5-dev. +# - To run tests related to JupyterHubAuthenticator, you need to install +# jupyterhub and the Node npm package configurable-http-proxy that JupyterHub +# depends on to route traffic. +# - To run tests related to the Helm chart, you need to install the helm CLI. 
+# diff --git a/tests/test_pbs_backend.py b/tests/test_pbs_backend.py index d7fdfe64..b033d632 100644 --- a/tests/test_pbs_backend.py +++ b/tests/test_pbs_backend.py @@ -61,8 +61,8 @@ async def do_stop_cluster(self, cluster): async def test_pbs_backend(): c = Config() - c.PBSClusterConfig.scheduler_cmd = "/opt/miniconda/bin/dask-scheduler" - c.PBSClusterConfig.worker_cmd = "/opt/miniconda/bin/dask-worker" + c.PBSClusterConfig.scheduler_cmd = "/opt/python/bin/dask-scheduler" + c.PBSClusterConfig.worker_cmd = "/opt/python/bin/dask-worker" c.PBSClusterConfig.scheduler_memory = "256M" c.PBSClusterConfig.worker_memory = "256M" c.PBSClusterConfig.scheduler_cores = 1 diff --git a/tests/test_slurm_backend.py b/tests/test_slurm_backend.py index 3b9a1abf..5df40fb0 100644 --- a/tests/test_slurm_backend.py +++ b/tests/test_slurm_backend.py @@ -68,8 +68,8 @@ async def do_stop_cluster(self, cluster): async def test_slurm_backend(): c = Config() - c.SlurmClusterConfig.scheduler_cmd = "/opt/miniconda/bin/dask-scheduler" - c.SlurmClusterConfig.worker_cmd = "/opt/miniconda/bin/dask-worker" + c.SlurmClusterConfig.scheduler_cmd = "/opt/python/bin/dask-scheduler" + c.SlurmClusterConfig.worker_cmd = "/opt/python/bin/dask-worker" c.SlurmClusterConfig.scheduler_memory = "256M" c.SlurmClusterConfig.worker_memory = "256M" c.SlurmClusterConfig.scheduler_cores = 1 diff --git a/tests/test_yarn_backend.py b/tests/test_yarn_backend.py index e7e75980..69dec95b 100644 --- a/tests/test_yarn_backend.py +++ b/tests/test_yarn_backend.py @@ -51,8 +51,8 @@ async def do_stop_cluster(self, cluster): async def test_yarn_backend(): c = Config() - c.YarnClusterConfig.scheduler_cmd = "/opt/miniconda/bin/dask-scheduler" - c.YarnClusterConfig.worker_cmd = "/opt/miniconda/bin/dask-worker" + c.YarnClusterConfig.scheduler_cmd = "/opt/python/bin/dask-scheduler" + c.YarnClusterConfig.worker_cmd = "/opt/python/bin/dask-worker" c.YarnClusterConfig.scheduler_memory = "512M" c.YarnClusterConfig.worker_memory = "512M" 
c.YarnClusterConfig.scheduler_cores = 1