Skip to content

Commit

Permalink
ci: de-duplicate deps and get ci images to build again
Browse files Browse the repository at this point in the history
  • Loading branch information
consideRatio committed Apr 4, 2022
1 parent 76f43c0 commit 9ec4a39
Show file tree
Hide file tree
Showing 53 changed files with 980 additions and 513 deletions.
56 changes: 8 additions & 48 deletions .github/workflows/build-publish-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@ on:
- "docs/**"
- "dask-gateway/**"
- "dask-gateway-server/**"
- "continuous_integration/install.sh"
- ".github/workflows/build-publish-docs.yaml"
push:
paths:
- "docs/**"
- "dask-gateway/**"
- "dask-gateway-server/**"
- "continuous_integration/install.sh"
- ".github/workflows/build-publish-docs.yaml"
branches-ignore:
- "dependabot/**"
Expand All @@ -39,44 +37,18 @@ jobs:

steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v3
with:
python-version: "3.10"

# FIXME: go is used to compile dask-gateway-server/dask-gateway-proxy. That
# isn't relevant for us when building documentation, but we currently
# end up doing it anyhow because we re-use a coarse installation
# script.
#
# If this is fixed, also update the job below.
#
- uses: actions/setup-go@v3
with:
go-version: "1.18"

# FIXME: node is used to install configurable-http-proxy, used by
# JupyterHub when started for us to run tests. As we don't need
# this to setup docs, we can get rid of this if we update our
# installation of dependencies to be a bit more scoped.
#
# If this is fixed, also update the job below.
#
- uses: actions/setup-node@v3
with:
node-version: "16"

- name: Install docs requirements
- name: Install Python docs requirements
run: |
pushd dask-gateway-server/dask-gateway-proxy
go get github.com/stretchr/testify/assert
popd
continuous_integration/install.sh
pip install sphinx dask-sphinx-theme sphinx-autobuild autodoc-traits kubernetes_asyncio skein sqlalchemy
cd docs
pip install -r requirements.txt
- name: Build docs (make html)
run: |
pushd docs
cd docs
make html SPHINXOPTS='--color -W --keep-going'
- name: Push built docs to gh-pages branch
Expand All @@ -92,28 +64,16 @@ jobs:

steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v3
with:
python-version: "3.10"

- uses: actions/setup-go@v3
with:
go-version: "1.18"

- uses: actions/setup-node@v3
with:
node-version: "16"

- name: Install docs requirements
- name: Install Python docs requirements
run: |
pushd dask-gateway-server/dask-gateway-proxy
go get github.com/stretchr/testify/assert
popd
continuous_integration/install.sh
pip install sphinx dask-sphinx-theme sphinx-autobuild autodoc-traits kubernetes_asyncio skein sqlalchemy
cd docs
pip install -r requirements.txt
- name: Linkcheck docs (make linkcheck)
run: |
pushd docs
cd docs
make linkcheck SPHINXOPTS='--color -W --keep-going'
101 changes: 81 additions & 20 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,28 +67,34 @@ jobs:

steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v3
with:
python-version: "${{ matrix.python-version }}"

- uses: actions/setup-go@v3
with:
go-version: "${{ matrix.go-version }}"

- uses: actions/setup-node@v3

# When jupyterhub is mock started as part of running tests, it depends on
# the Node npm package configurable-http-proxy.
#
- name: Install jupyterhub with system dependencies
run: |
npm install -g configurable-http-proxy
pip install jupyterhub
- name: Install Python test requirements
run: |
continuous_integration/install.sh
cd tests
pip install -r requirements.txt
- name: List Python packages
run: |
pip freeze
- name: Run Python tests
run: |
py.test tests/ -k 'not kubernetes' -v
pytest -v tests/ -k 'not kubernetes'
- name: Install Go test requirements
run: |
Expand All @@ -100,17 +106,6 @@ jobs:
cd dask-gateway-server/dask-gateway-proxy
go test
hadoop-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Hadoop Install/Start
if: ${{ env.commit_msg != 'skip-tests' }}
run: |
./continuous_integration/docker/hadoop/start.sh
./continuous_integration/docker/hadoop/install.sh
./continuous_integration/docker/hadoop/script.sh
kubernetes-tests:
runs-on: ubuntu-latest
strategy:
Expand Down Expand Up @@ -150,11 +145,21 @@ jobs:
- name: Helm Install
run: |
./continuous_integration/kubernetes/helm-install.sh
./continuous_integration/kubernetes/install.sh
- name: Install Python test requirements
run: |
cd tests
pip install -r requirements.txt
- name: List Python packages
run: |
pip list
- name: Kubernetes Tests
run: |
./continuous_integration/kubernetes/script.sh
TEST_DASK_GATEWAY_KUBE=true \
TEST_DASK_GATEWAY_KUBE_ADDRESS=http://localhost:30200/services/dask-gateway/ \
pytest -v tests/kubernetes/
# ref: https://github.com/jupyterhub/action-k8s-namespace-report
- name: Kubernetes namespace report
Expand All @@ -166,22 +171,78 @@ jobs:
deploy/controller-test-dask-gateway
deploy/traefik-test-dask-gateway
# The tests run in this job rely on setting up a development environment in
# a pre-built container.
#
# - start.sh - starts the container
# - install.sh - sets up for testing in the container
# - script.sh - runs tests in the container
#
hadoop-tests:
runs-on: ubuntu-latest
permissions:
contents: read
packages: read

steps:
- uses: actions/checkout@v3

- name: Login to ghcr.io read access to CI image
run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin

- name: Hadoop tests (Yarn and Kerberos involved)
if: ${{ env.commit_msg != 'skip-tests' }}
run: |
./continuous_integration/docker/hadoop/start.sh
./continuous_integration/docker/hadoop/install.sh
./continuous_integration/docker/hadoop/script.sh
# The tests run in this job rely on setting up a development environment in
# a pre-built container.
#
# - start.sh - starts the container
# - install.sh - sets up for testing in the container
# - script.sh - runs tests in the container
#
pbs-tests:
runs-on: ubuntu-latest
permissions:
contents: read
packages: read

steps:
- uses: actions/checkout@v3
- name: PBS Tests

- name: Login to ghcr.io read access to CI image
run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin

- name: PBS tests (A jobqueue backend)
if: ${{ env.commit_msg != 'skip-tests' }}
run: |
./continuous_integration/docker/pbs/start.sh
./continuous_integration/docker/pbs/install.sh
./continuous_integration/docker/pbs/script.sh
# The tests run in this job rely on setting up a development environment in
# a pre-built container.
#
# - start.sh - starts the container
# - install.sh - sets up for testing in the container
# - script.sh - runs tests in the container
#
slurm-tests:
runs-on: ubuntu-latest
permissions:
contents: read
packages: read

steps:
- uses: actions/checkout@v3
- name: Slurm Tests

- name: Login to ghcr.io read access to CI image
run: echo "${{ secrets.github_token }}" | docker login ghcr.io -u $ --password-stdin

- name: Slurm tests (A jobqueue backend)
if: ${{ env.commit_msg != 'skip-tests' }}
run: |
./continuous_integration/docker/slurm/start.sh
Expand Down
137 changes: 137 additions & 0 deletions continuous_integration/docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# About these Dockerfiles

As dask-gateway can be used to start different kinds of dask clusters, we need
to be able to test against those dask cluster backends. To do that we maintain
docker images setup to run the various dask cluster backends so we can test
against them.

The images don't install `dask-gateway-server` within them, as then we would
need to rebuild the images all the time with the specific version of
`dask-gateway-server` we want to test. Instead, the idea is to mount the local
code to a container and install dependencies before running the tests. For
example, the `start.sh` script starts a container, and `install.sh`/`script.sh`
are wrappers to run `_install.sh`/`_script.py` scripts in the started
container.

## Manual build and update of images

For now these images are built and updated manually. Below are instructions for
a maintainer of the dask/dask-gateway repo on how to do it.

1. Create a personal access token (PAT) for your account with `write:packages`
permissions at https://github.com/settings/tokens/new.

1. Login to the ghcr.io container registry with the PAT:

```shell
docker login ghcr.io -u your-username
```

1. Build the images:

```shell
docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-base ./base
docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-hadoop ./hadoop
docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-pbs ./pbs
docker build --no-cache -t ghcr.io/dask/dask-gateway-ci-slurm ./slurm
```

1. Verify that the images seem to work:

```shell
# hadoop: verify that the supervisord programs start successfully
docker run --hostname=master.example.com --rm ghcr.io/dask/dask-gateway-ci-hadoop

# pbs: verify that the logs don't include errors
docker run --hostname=pbs --rm ghcr.io/dask/dask-gateway-ci-pbs

# slurm: verify that the supervisord programs start successfully
docker run --hostname=slurm --rm ghcr.io/dask/dask-gateway-ci-slurm
```

1. Push the images:

```shell
docker push ghcr.io/dask/dask-gateway-ci-base
docker push ghcr.io/dask/dask-gateway-ci-hadoop
docker push ghcr.io/dask/dask-gateway-ci-pbs
docker push ghcr.io/dask/dask-gateway-ci-slurm
```

## Debugging

### General advice

1. If you get a `docker build` error, you can run `docker run -it --rm <hash>`
against the layer saved just before the erroring step, and then manually run
the next `RUN` step or inspect the current state of its file system. Note that
intermediary layers are not saved if you have set `export DOCKER_BUILDKIT=1`,
so this trick can only be used without buildkit.
1. A Dockerfile's `COPY` command can update permissions of folders if you let it
copy nested folders. For example, `COPY ./files /` would update the
permissions of `/etc` based on the permissions set on the folder and files in
this git repo locally.
1. File permissions you have set in this git repo locally won't be version
controlled, besides the execute bit. Due to that, you must avoid relying on
local file permissions when building images.

### The hadoop image

Setting up the YARN backend, part of Hadoop, was very tricky. Here are some
commands that are useful for debugging the container.

```shell
# Build the base image
docker build --tag ghcr.io/dask/dask-gateway-ci-base ./base

# Build the hadoop image
docker build --tag ghcr.io/dask/dask-gateway-ci-hadoop ./hadoop

# Start a container and watch logs from supervisord that starts the various
# programs we need to configure and run successfully.
docker run --hostname master.example.com --rm ghcr.io/dask/dask-gateway-ci-hadoop

# Start a container and inspect the container from a shell if something doesn't
# start correctly.
docker stop hadoop --timeout=0
docker run --name hadoop --hostname master.example.com --detach --rm ghcr.io/dask/dask-gateway-ci-hadoop
docker exec -it hadoop bash

# Useful commands to run INSIDE the built and started container
supervisorctl status
cat /var/log/supervisor/hdfs-namenode.log
cat /var/log/supervisor/hdfs-datanode.log
cat /var/log/supervisor/yarn-nodemanager.log
cat /var/log/supervisor/yarn-resourcemanager.log
cat /var/log/supervisor/krb5kdc.log
cat /var/log/supervisor/kadmind.log
```

### The slurm image

If you upgrade `slurm` to a new version, you may very well run into breaking
changes in your `slurm.conf`.

```shell
# Build the base image
docker build --tag ghcr.io/dask/dask-gateway-ci-base ./base

# Build the slurm image
docker build --tag ghcr.io/dask/dask-gateway-ci-slurm ./slurm

# Start a container and watch logs from supervisord that starts the various
# programs we need to configure and run successfully.
docker run --hostname slurm --rm ghcr.io/dask/dask-gateway-ci-slurm

# Start a container and inspect the container from a shell if something doesn't
# start correctly.
docker stop slurm --timeout=0
docker run --name slurm --hostname slurm --detach --rm ghcr.io/dask/dask-gateway-ci-slurm
docker exec -it slurm bash

# Useful commands to run INSIDE the built and started container
supervisorctl status
cat /var/log/supervisord.log
cat /var/log/supervisor/slurmdbd.log
cat /var/log/supervisor/slurmctld.log
```
Loading

0 comments on commit 9ec4a39

Please sign in to comment.