From 062deb3ab39525c741907893dca402aa480b8b0b Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 15:46:18 +0000 Subject: [PATCH] fix: include `_kernel_signatures.py` in package (#2819) * fix: include _kernel_signatures.py in package * test: fix exception handling on 3.8 * chore: add GPU test executor * chore: bundle entrypoint * chore: add helpers for GPU testing in containers * refactor: add README * refactor: add README * docs: nvidia-ctk * docs: cleanup comments * fix: enable awkward reporting, make service oneshot --- dev/cuda-tests/README.md | 34 +++++++++++++++++++++ dev/cuda-tests/cuda-tests-entrypoint | 44 ++++++++++++++++++++++++++++ dev/cuda-tests/cuda-tests.Dockerfile | 12 ++++++++ dev/cuda-tests/cuda-tests.service | 23 +++++++++++++++ dev/cuda-tests/cuda-tests.timer | 19 ++++++++++++ dev/generate-kernel-signatures.py | 2 +- pyproject.toml | 5 ++-- tests-cuda/test_1381_check_errors.py | 2 +- 8 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 dev/cuda-tests/README.md create mode 100755 dev/cuda-tests/cuda-tests-entrypoint create mode 100644 dev/cuda-tests/cuda-tests.Dockerfile create mode 100644 dev/cuda-tests/cuda-tests.service create mode 100644 dev/cuda-tests/cuda-tests.timer diff --git a/dev/cuda-tests/README.md b/dev/cuda-tests/README.md new file mode 100644 index 0000000000..ac317a72ae --- /dev/null +++ b/dev/cuda-tests/README.md @@ -0,0 +1,34 @@ +# CUDA Tests Container + +This directory contains resources for running the Awkward CUDA tests inside a Docker container. It is possible to use other container runtimes e.g. podman. + +## Build Container + +1. Build Container + + Only required if not using the pre-built container image + ```bash + docker build -f cuda-tests.Dockerfile -t awkward/cuda-tests:latest . + ``` +2. Install systemd units (optional) + ```bash + sudo cp cuda-tests.service cuda-tests.timer /etc/systemd/system/ + ``` +3. 
Activate systemd units (optional) + ```bash + sudo systemctl enable cuda-tests.service cuda-tests.timer + ``` +4. Store GitHub API token with `repo` credentials in `/etc/cuda-gh-token` + ```bash + echo "ghp_..." | sudo tee /etc/cuda-gh-token > /dev/null + ``` +5. Install NVIDIA Container Toolkit (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) +6. Run container (if not using systemd) + ```bash + docker run --rm \ + --runtime=nvidia \ + --gpus all \ + -v "/etc:/creds" \ + -e GH_TOKEN_PATH=/creds/cuda-gh-token \ + agoose77/cuda-tests:latest + ``` diff --git a/dev/cuda-tests/cuda-tests-entrypoint b/dev/cuda-tests/cuda-tests-entrypoint new file mode 100755 index 0000000000..479b07ef94 --- /dev/null +++ b/dev/cuda-tests/cuda-tests-entrypoint @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +set -o nounset + +# Try loading token from path if it's not set +if [[ ! -v GH_TOKEN ]]; then + TOKEN_FROM_FILE="$(cat "${GH_TOKEN_PATH}")" + export GH_TOKEN="${TOKEN_FROM_FILE}" +fi + +# Setup (steps are chained with && so that a failure in any step triggers the report) +{ + cd "$(mktemp -d)" && + + # Clone awkward + git clone https://github.com/scikit-hep/awkward --depth=1 && + cd awkward && + + # Generate missing files + /usr/bin/nox -s prepare -- --headers --signatures --tests && + + # Prepare environment + python3 -m venv /opt/build-venv && + export PATH="/opt/build-venv/bin:$PATH" && + + # Prepare build + python3 -m pip install wheel build && + + # Install awkward and dependencies (quote the requirement so the shell does not treat ">=6" as a redirection) + python3 -m pip install -v --only-binary "numpy" . ./awkward-cpp cupy-cuda11x "pytest>=6"; +} || { gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." 
-R scikit-hep/awkward; exit 1; } + +# Test +{ + # Run pytest + python3 -m pytest -vv -rs tests-cuda tests-cuda-kernels > test-output.txt; +} || { + # Prepare issue body + printf "The GPU tests failed for commit %s with the following pytest output:\n\n\`\`\`\n" "$(git rev-parse HEAD)" > issue-body.txt; + tail -c 64000 test-output.txt >> issue-body.txt; + printf "\n\`\`\`" >> issue-body.txt; + # File report + gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R scikit-hep/awkward; +} diff --git a/dev/cuda-tests/cuda-tests.Dockerfile b/dev/cuda-tests/cuda-tests.Dockerfile new file mode 100644 index 0000000000..bd15bed95f --- /dev/null +++ b/dev/cuda-tests/cuda-tests.Dockerfile @@ -0,0 +1,12 @@ +FROM nvidia/cuda:11.2.2-devel-ubuntu20.04 +WORKDIR /app + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update \ + && apt-get install -y python3 python3-pip python3-venv python3-dev python3-wheel g++ git cmake make patch curl nox \ + && curl https://github.com/cli/cli/releases/download/v2.39.1/gh_2.39.1_linux_amd64.deb -L -o /tmp/gh.deb \ + && apt-get install -y /tmp/gh.deb \ + && rm -rf /var/lib/apt/lists/* + +COPY cuda-tests-entrypoint /app/entrypoint +ENTRYPOINT ["/app/entrypoint"] diff --git a/dev/cuda-tests/cuda-tests.service b/dev/cuda-tests/cuda-tests.service new file mode 100644 index 0000000000..e83eb4ab4b --- /dev/null +++ b/dev/cuda-tests/cuda-tests.service @@ -0,0 +1,23 @@ +[Unit] +Description=Awkward CUDA Tests Service +After=docker.service +Requires=docker.service + +[Service] +TimeoutStartSec=0 +Restart=no +Type=oneshot +ExecStartPre=-/usr/bin/docker stop %n +ExecStartPre=-/usr/bin/docker rm %n +# Allow pull to fail if the image is e.g. 
only local (- prefix to this command) +ExecStartPre=-/usr/bin/docker pull agoose77/cuda-tests:latest +ExecStart=/usr/bin/docker run --rm --name %n \ + --runtime=nvidia \ + --gpus all \ + -v "${CREDENTIALS_DIRECTORY}:/creds" \ + -e GH_TOKEN_PATH=/creds/gh-token \ + agoose77/cuda-tests:latest +LoadCredential=gh-token:/etc/cuda-gh-token + +[Install] +WantedBy=default.target diff --git a/dev/cuda-tests/cuda-tests.timer b/dev/cuda-tests/cuda-tests.timer new file mode 100644 index 0000000000..073ec1faf4 --- /dev/null +++ b/dev/cuda-tests/cuda-tests.timer @@ -0,0 +1,19 @@ +[Unit] +Description=Schedule a CUDA test job every day +# Allow manual starts +RefuseManualStart=no +# Allow manual stops +RefuseManualStop=no + +[Timer] +# Execute job if it missed a run due to machine being off +Persistent=true +# Run 240 seconds after boot for the first time +OnBootSec=240 +# Run every day (doesn't matter when due to persistent) +OnCalendar=daily +# File describing job to execute +Unit=cuda-tests.service + +[Install] +WantedBy=timers.target diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index f95d1dada0..b043911fc3 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -454,7 +454,7 @@ def f(grid, block, args): """ ) - print("Done with src/awkward/connect/cuda/_kernel_signatures.py...") + print("Done with src/awkward/_connect/cuda/_kernel_signatures.py...") if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index fe4e10ac61..e2c03a2621 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,8 @@ init = "awkward.numba:_register" [tool.hatch.build] artifacts = [ - "/src/awkward/_connect/header-only" + "/src/awkward/_connect/header-only", + "/src/awkward/_connect/cuda/_kernel_signatures.py" ] [tool.hatch.build.targets.wheel] @@ -86,7 +87,7 @@ include = [ "/requirements-test.txt" ] artifacts = [ - "/tests-cuda-kernels", + "/tests-cuda-kernels" ] [tool.hatch.metadata.hooks.fancy-pypi-readme] 
diff --git a/tests-cuda/test_1381_check_errors.py b/tests-cuda/test_1381_check_errors.py index 332dff0b05..34eb689cc1 100644 --- a/tests-cuda/test_1381_check_errors.py +++ b/tests-cuda/test_1381_check_errors.py @@ -33,7 +33,7 @@ def test(): assert isinstance(err.value, ValueError) - message = "".join(traceback.format_exception(err.value)) + message = "".join(traceback.format_exception(err.type, err.value, err.tb)) assert ( "ValueError: index out of range in compiled CUDA code " "(awkward_RegularArray_getitem_next_at)\n"