From 57d6f42a941a895a70027b7c4b5bf0949a86286f Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 10:27:18 +0000 Subject: [PATCH 01/10] fix: include _kernel_signatures.py in package --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fe4e10ac61..e2c03a2621 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,8 @@ init = "awkward.numba:_register" [tool.hatch.build] artifacts = [ - "/src/awkward/_connect/header-only" + "/src/awkward/_connect/header-only", + "/src/awkward/_connect/cuda/_kernel_signatures.py" ] [tool.hatch.build.targets.wheel] @@ -86,7 +87,7 @@ include = [ "/requirements-test.txt" ] artifacts = [ - "/tests-cuda-kernels", + "/tests-cuda-kernels" ] [tool.hatch.metadata.hooks.fancy-pypi-readme] From 318300246eac9726439507ae443185621bac2f68 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 10:38:30 +0000 Subject: [PATCH 02/10] test: fix exception handling on 3.8 --- tests-cuda/test_1381_check_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests-cuda/test_1381_check_errors.py b/tests-cuda/test_1381_check_errors.py index 332dff0b05..34eb689cc1 100644 --- a/tests-cuda/test_1381_check_errors.py +++ b/tests-cuda/test_1381_check_errors.py @@ -33,7 +33,7 @@ def test(): assert isinstance(err.value, ValueError) - message = "".join(traceback.format_exception(err.value)) + message = "".join(traceback.format_exception(err.type, err.value, err.tb)) assert ( "ValueError: index out of range in compiled CUDA code " "(awkward_RegularArray_getitem_next_at)\n" From 32e81cb1ad642146474fde309b713eca3a50c64f Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 11:42:28 +0000 Subject: [PATCH 03/10] chore: add GPU test executor --- dev/cuda-tests-entrypoint | 35 +++++++++++++++++++++++++++++++++++ dev/cuda-tests.Dockerfile | 11 +++++++++++ 2 files changed, 46 insertions(+) create mode 100755 dev/cuda-tests-entrypoint 
create mode 100644 dev/cuda-tests.Dockerfile diff --git a/dev/cuda-tests-entrypoint b/dev/cuda-tests-entrypoint new file mode 100755 index 0000000000..82f2971523 --- /dev/null +++ b/dev/cuda-tests-entrypoint @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -u + +# Setup +{ + cd "$(mktemp -d)"; + + # Clone awkward + git clone https://github.com/scikit-hep/awkward --depth=1; + cd awkward; + + # Generate and install dependencies + /usr/bin/nox -s prepare -- --headers --signatures --tests; + + # Prepare environment + python3 -m venv /opt/build-venv; + export PATH="/opt/build-venv/bin:$PATH"; + + # Prepare build + python3 -m pip install wheel build; + + # Run build + python3 -m pip install -v --only-binary "numpy" . ./awkward-cpp cupy-cuda11x pytest>=6; +} || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." -R agoose77/test-ccache-gha; + +# Test +{ + python3 -m pytest -vv -rs tests-cuda tests-cuda-kernels > test-output.txt; +} || { + printf "The GPU tests failed for commit %s with the following pytest output:\n\n\`\`\`\n" "$(git rev-parse HEAD)" > issue-body.txt; + tail -c 64000 test-output.txt >> issue-body.txt; + printf "\n\`\`\`" >> issue-body.txt; + gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R agoose77/test-ccache-gha; +} diff --git a/dev/cuda-tests.Dockerfile b/dev/cuda-tests.Dockerfile new file mode 100644 index 0000000000..fa1b87a842 --- /dev/null +++ b/dev/cuda-tests.Dockerfile @@ -0,0 +1,11 @@ +FROM nvidia/cuda:11.2.2-devel-ubuntu20.04 +WORKDIR /app + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update \ + && apt-get install -y python3 python3-pip python3-venv python3-dev python3-wheel g++ git cmake make patch curl nox \ + && curl https://github.com/cli/cli/releases/download/v2.39.1/gh_2.39.1_linux_amd64.deb -L -o /tmp/gh.deb \ + && apt-get install -y /tmp/gh.deb \ + && rm -rf /var/lib/apt/lists/* + +ENTRYPOINT ["/usr/bin/bash"] From 
253ca11bb0ccea3daa60be5a31b48cda84fee6f4 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 11:47:43 +0000 Subject: [PATCH 04/10] chore: bundle entrypoint --- dev/cuda-tests.Dockerfile | 3 ++- dev/generate-kernel-signatures.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dev/cuda-tests.Dockerfile b/dev/cuda-tests.Dockerfile index fa1b87a842..bd15bed95f 100644 --- a/dev/cuda-tests.Dockerfile +++ b/dev/cuda-tests.Dockerfile @@ -8,4 +8,5 @@ RUN apt-get update \ && apt-get install -y /tmp/gh.deb \ && rm -rf /var/lib/apt/lists/* -ENTRYPOINT ["/usr/bin/bash"] +COPY cuda-tests-entrypoint /app/entrypoint +ENTRYPOINT ["/app/entrypoint"] diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py index f95d1dada0..b043911fc3 100644 --- a/dev/generate-kernel-signatures.py +++ b/dev/generate-kernel-signatures.py @@ -454,7 +454,7 @@ def f(grid, block, args): """ ) - print("Done with src/awkward/connect/cuda/_kernel_signatures.py...") + print("Done with src/awkward/_connect/cuda/_kernel_signatures.py...") if __name__ == "__main__": From 9bf63c85144e4f73489044ad42791bb93f4a2d11 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 13:22:08 +0000 Subject: [PATCH 05/10] chore: add helpers for GPU testing in containers --- dev/cuda-tests-entrypoint | 12 +++++++++--- dev/cuda-tests.service | 21 +++++++++++++++++++++ dev/cuda-tests.timer | 19 +++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 dev/cuda-tests.service create mode 100644 dev/cuda-tests.timer diff --git a/dev/cuda-tests-entrypoint b/dev/cuda-tests-entrypoint index 82f2971523..31c229fbd9 100755 --- a/dev/cuda-tests-entrypoint +++ b/dev/cuda-tests-entrypoint @@ -1,6 +1,12 @@ #!/usr/bin/env bash -set -u +set -o nounset + +# Try loading token from path if it's not set +if [[ ! 
-v GH_TOKEN ]]; then + TOKEN_FROM_FILE="$(cat "${GH_TOKEN_PATH}")" + export GH_TOKEN="${TOKEN_FROM_FILE}" +fi # Setup { @@ -22,7 +28,7 @@ set -u # Run build python3 -m pip install -v --only-binary "numpy" . ./awkward-cpp cupy-cuda11x pytest>=6; -} || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." -R agoose77/test-ccache-gha; +} || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." -R scikit-hep/awkward; # Test { @@ -31,5 +37,5 @@ set -u printf "The GPU tests failed for commit %s with the following pytest output:\n\n\`\`\`\n" "$(git rev-parse HEAD)" > issue-body.txt; tail -c 64000 test-output.txt >> issue-body.txt; printf "\n\`\`\`" >> issue-body.txt; - gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R agoose77/test-ccache-gha; + gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R scikit-hep/awkward; } diff --git a/dev/cuda-tests.service b/dev/cuda-tests.service new file mode 100644 index 0000000000..af9ff71cfb --- /dev/null +++ b/dev/cuda-tests.service @@ -0,0 +1,21 @@ +[Unit] +Description=Awkward CUDA Tests Service +After=docker.service +Requires=docker.service + +[Service] +TimeoutStartSec=0 +Restart=always +ExecStartPre=-/usr/bin/docker exec %n stop +ExecStartPre=-/usr/bin/docker rm %n +ExecStartPre=/usr/bin/docker pull agoose77/cuda-tests:latest +ExecStart=/usr/bin/docker run --rm --name %n \ + --runtime=nvidia \ + --gpus all \ + -v "${CREDENTIALS_DIRECTORY}:/creds" \ + -e GH_TOKEN_PATH=/creds/gh-token \ + agoose77/cuda-tests:latest +LoadCredential=gh-token:/etc/cuda-gh-token + +[Install] +WantedBy=default.target diff --git a/dev/cuda-tests.timer b/dev/cuda-tests.timer new file mode 100644 index 0000000000..073ec1faf4 --- /dev/null +++ b/dev/cuda-tests.timer @@ -0,0 +1,19 @@ +[Unit] +Description=Schedule a CUDA test job every day +# Allow manual starts +RefuseManualStart=no 
+# Allow manual stops +RefuseManualStop=no + +[Timer] +# Execute job if it missed a run due to machine being off +Persistent=true +# Run 240 seconds after boot for the first time +OnBootSec=240 +# Run every day (doesn't matter when due to persistent) +OnCalendar=daily +# File describing job to execute +Unit=cuda-tests.service + +[Install] +WantedBy=timers.target From 21e82edc866d44ad75d34162d8766e98bc37df7f Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 13:35:03 +0000 Subject: [PATCH 06/10] refactor: add README --- dev/cuda-tests/README.md | 32 ++++++++++++++++++++++ dev/{ => cuda-tests}/cuda-tests-entrypoint | 0 dev/{ => cuda-tests}/cuda-tests.Dockerfile | 0 dev/{ => cuda-tests}/cuda-tests.service | 3 +- dev/{ => cuda-tests}/cuda-tests.timer | 0 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 dev/cuda-tests/README.md rename dev/{ => cuda-tests}/cuda-tests-entrypoint (100%) rename dev/{ => cuda-tests}/cuda-tests.Dockerfile (100%) rename dev/{ => cuda-tests}/cuda-tests.service (77%) rename dev/{ => cuda-tests}/cuda-tests.timer (100%) diff --git a/dev/cuda-tests/README.md b/dev/cuda-tests/README.md new file mode 100644 index 0000000000..80adf34325 --- /dev/null +++ b/dev/cuda-tests/README.md @@ -0,0 +1,32 @@ +# CUDA Tests Container + +This directory contains resources for running the Awkward CUDA tests inside a Docker container. It is possible to use other container runtimes e.g. podman. + +## Build Container + +1. Build Container + Only required if not using the pre-built container image + ```bash + docker build -f cuda-tests.Dockerfile -t awkward/cuda-tests:latest . + ``` +2. Install systemd units (optional) + ```bash + sudo cp cuda-tests.service cuda-tests.timer /etc/systemd/system/ + ``` +3. Activate systemd units (optional) + ```bash + sudo systemctl enable cuda-tests.service cuda-tests.timer + ``` +4. Store GitHub API token with `repo` credentials in `/etc/cuda-gh-token` + ```bash + sudo echo "ghp_..." 
> /etc/cuda-gh-token + ``` +5. Run container (if not using systemd) + ```bash + docker run --rm \ + --runtime=nvidia \ + --gpus all \ + -v "/etc:/creds" \ + -e GH_TOKEN_PATH=/creds/cuda-gh-token \ + agoose77/cuda-tests:latest + ``` diff --git a/dev/cuda-tests-entrypoint b/dev/cuda-tests/cuda-tests-entrypoint similarity index 100% rename from dev/cuda-tests-entrypoint rename to dev/cuda-tests/cuda-tests-entrypoint diff --git a/dev/cuda-tests.Dockerfile b/dev/cuda-tests/cuda-tests.Dockerfile similarity index 100% rename from dev/cuda-tests.Dockerfile rename to dev/cuda-tests/cuda-tests.Dockerfile diff --git a/dev/cuda-tests.service b/dev/cuda-tests/cuda-tests.service similarity index 77% rename from dev/cuda-tests.service rename to dev/cuda-tests/cuda-tests.service index af9ff71cfb..866b9d9ccd 100644 --- a/dev/cuda-tests.service +++ b/dev/cuda-tests/cuda-tests.service @@ -8,7 +8,8 @@ TimeoutStartSec=0 Restart=always ExecStartPre=-/usr/bin/docker exec %n stop ExecStartPre=-/usr/bin/docker rm %n -ExecStartPre=/usr/bin/docker pull agoose77/cuda-tests:latest +# Allow pull to fail if the image is e.g. only local (- prefix to this command) +ExecStartPre=-/usr/bin/docker pull agoose77/cuda-tests:latest ExecStart=/usr/bin/docker run --rm --name %n \ --runtime=nvidia \ --gpus all \ diff --git a/dev/cuda-tests.timer b/dev/cuda-tests/cuda-tests.timer similarity index 100% rename from dev/cuda-tests.timer rename to dev/cuda-tests/cuda-tests.timer From 694bb01c40e8708a649b2b80b606f0a8cc59d069 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 13:35:22 +0000 Subject: [PATCH 07/10] refactor: add README --- dev/cuda-tests/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/cuda-tests/README.md b/dev/cuda-tests/README.md index 80adf34325..9d64313c69 100644 --- a/dev/cuda-tests/README.md +++ b/dev/cuda-tests/README.md @@ -5,6 +5,7 @@ This directory containers resources for running the Awkward CUDA tests inside a ## Build Container 1. 
Build Container + Only required if not using the pre-built container image ```bash docker build -f cuda-tests.Dockerfile -t awkward/cuda-tests:latest . From c10feab7648097556cd7837ad776a99fbbcf906d Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 13:39:16 +0000 Subject: [PATCH 08/10] docs: nvidia-ctk --- dev/cuda-tests/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dev/cuda-tests/README.md b/dev/cuda-tests/README.md index 9d64313c69..ac317a72ae 100644 --- a/dev/cuda-tests/README.md +++ b/dev/cuda-tests/README.md @@ -22,7 +22,8 @@ This directory containers resources for running the Awkward CUDA tests inside a ```bash sudo echo "ghp_..." > /etc/cuda-gh-token ``` -5. Run container (if not using systemd) +5. Install NVIDIA Container Toolkit (see https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) +6. Run container (if not using systemd) ```bash docker run --rm \ --runtime=nvidia \ From 872068fdb17b67f45b762ed7a66f015b3c504cfa Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 13:40:11 +0000 Subject: [PATCH 09/10] docs: cleanup comments --- dev/cuda-tests/cuda-tests-entrypoint | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dev/cuda-tests/cuda-tests-entrypoint b/dev/cuda-tests/cuda-tests-entrypoint index 31c229fbd9..7eacc7cbb8 100755 --- a/dev/cuda-tests/cuda-tests-entrypoint +++ b/dev/cuda-tests/cuda-tests-entrypoint @@ -16,7 +16,7 @@ fi git clone https://github.com/scikit-hep/awkward --depth=1; cd awkward; - # Generate and install dependencies + # Generate missing files /usr/bin/nox -s prepare -- --headers --signatures --tests; # Prepare environment @@ -26,16 +26,19 @@ fi # Prepare build python3 -m pip install wheel build; - # Run build + # Install awkward and dependencies python3 -m pip install -v --only-binary "numpy" . 
./awkward-cpp cupy-cuda11x pytest>=6; } || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." -R scikit-hep/awkward; # Test { + # Run pytest python3 -m pytest -vv -rs tests-cuda tests-cuda-kernels > test-output.txt; } || { + # Prepare issue body printf "The GPU tests failed for commit %s with the following pytest output:\n\n\`\`\`\n" "$(git rev-parse HEAD)" > issue-body.txt; tail -c 64000 test-output.txt >> issue-body.txt; printf "\n\`\`\`" >> issue-body.txt; + # File report gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R scikit-hep/awkward; } From c37454a4ecf08533e67f2f01cccb24b07bd3cb50 Mon Sep 17 00:00:00 2001 From: Angus Hollands Date: Wed, 15 Nov 2023 14:00:44 +0000 Subject: [PATCH 10/10] fix: enable awkward reporting, make service oneshot --- dev/cuda-tests/cuda-tests-entrypoint | 4 ++-- dev/cuda-tests/cuda-tests.service | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dev/cuda-tests/cuda-tests-entrypoint b/dev/cuda-tests/cuda-tests-entrypoint index 7eacc7cbb8..479b07ef94 100755 --- a/dev/cuda-tests/cuda-tests-entrypoint +++ b/dev/cuda-tests/cuda-tests-entrypoint @@ -28,7 +28,7 @@ fi # Install awkward and dependencies python3 -m pip install -v --only-binary "numpy" . ./awkward-cpp cupy-cuda11x pytest>=6; -} || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." -R scikit-hep/awkward; +} || gh issue create --title "GPU Tests Setup Failed" --body "The test-runner for the GPU tests failed before hitting pytest." 
-R scikit-hep/awkward; # Test { @@ -40,5 +40,5 @@ fi tail -c 64000 test-output.txt >> issue-body.txt; printf "\n\`\`\`" >> issue-body.txt; # File report - gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R scikit-hep/awkward; + gh issue create --title "GPU Tests Failed" --body-file issue-body.txt -R scikit-hep/awkward; } diff --git a/dev/cuda-tests/cuda-tests.service b/dev/cuda-tests/cuda-tests.service index 866b9d9ccd..e83eb4ab4b 100644 --- a/dev/cuda-tests/cuda-tests.service +++ b/dev/cuda-tests/cuda-tests.service @@ -5,7 +5,8 @@ Requires=docker.service [Service] TimeoutStartSec=0 -Restart=always +Restart=never +Type=oneshot ExecStartPre=-/usr/bin/docker exec %n stop ExecStartPre=-/usr/bin/docker rm %n # Allow pull to fail if the image is e.g. only local (- prefix to this command)