From f1a14050ec9827c9401c6ca90617b16c358029dd Mon Sep 17 00:00:00 2001
From: Jake Awe <jawe@nvidia.com>
Date: Mon, 20 May 2024 21:12:44 +0000
Subject: [PATCH 01/17] DOC v24.08 Updates [skip ci]

---
 .github/workflows/build.yaml                     | 10 +++++-----
 .github/workflows/pr.yaml                        | 12 ++++++------
 .github/workflows/test.yaml                      |  2 +-
 VERSION                                          |  2 +-
 ci/build_docs.sh                                 |  2 +-
 conda/environments/all_cuda-114_arch-x86_64.yaml | 14 +++++++-------
 conda/environments/all_cuda-118_arch-x86_64.yaml | 14 +++++++-------
 conda/environments/all_cuda-122_arch-x86_64.yaml | 14 +++++++-------
 dependencies.yaml                                | 14 +++++++-------
 docs/source/explicit_comms.rst                   |  2 +-
 pyproject.toml                                   | 10 +++++-----
 11 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 56fed450..69b0de5f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 6688d0ff..4e56d24d 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2424729d..7a884c5c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/VERSION b/VERSION
index 0bff6981..ec8489fd 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.06.00
+24.08.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index a727d6da..21d57837 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -23,7 +23,7 @@ rapids-mamba-retry install \
     --channel "${PYTHON_CHANNEL}" \
     dask-cuda
 
-export RAPIDS_VERSION_NUMBER="24.06"
+export RAPIDS_VERSION_NUMBER="24.08"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build Python docs"
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index afdb516f..57f475a2 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.4
 - cudatoolkit
-- cudf==24.6.*
-- dask-cudf==24.6.*
-- distributed-ucxx==0.38.*
-- kvikio==24.6.*
+- cudf==24.8.*
+- dask-cudf==24.8.*
+- distributed-ucxx==0.39.*
+- kvikio==24.8.*
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -24,13 +24,13 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.6.*
+- rapids-dask-dependency==24.8.*
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.38.*
-- ucxx==0.38.*
+- ucx-py==0.39.*
+- ucxx==0.39.*
 - zict>=2.0.0
 name: all_cuda-114_arch-x86_64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a25b7e7a..627df99c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.8
 - cudatoolkit
-- cudf==24.6.*
-- dask-cudf==24.6.*
-- distributed-ucxx==0.38.*
-- kvikio==24.6.*
+- cudf==24.8.*
+- dask-cudf==24.8.*
+- distributed-ucxx==0.39.*
+- kvikio==24.8.*
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -24,13 +24,13 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.6.*
+- rapids-dask-dependency==24.8.*
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.38.*
-- ucxx==0.38.*
+- ucx-py==0.39.*
+- ucxx==0.39.*
 - zict>=2.0.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index ff2dea69..e122bd43 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -11,10 +11,10 @@ dependencies:
 - cuda-nvcc-impl
 - cuda-nvrtc
 - cuda-version=12.2
-- cudf==24.6.*
-- dask-cudf==24.6.*
-- distributed-ucxx==0.38.*
-- kvikio==24.6.*
+- cudf==24.8.*
+- dask-cudf==24.8.*
+- distributed-ucxx==0.39.*
+- kvikio==24.8.*
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -25,13 +25,13 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.6.*
+- rapids-dask-dependency==24.8.*
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.38.*
-- ucxx==0.38.*
+- ucx-py==0.39.*
+- ucxx==0.39.*
 - zict>=2.0.0
 name: all_cuda-122_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 20c6ca05..3a21a1d3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -153,23 +153,23 @@ dependencies:
           - numpy>=1.23,<2.0a0
           - pandas>=1.3
           - pynvml>=11.0.0,<11.5
-          - rapids-dask-dependency==24.6.*
+          - rapids-dask-dependency==24.8.*
           - zict>=2.0.0
   test_python:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cudf==24.6.*
-          - dask-cudf==24.6.*
-          - kvikio==24.6.*
+          - cudf==24.8.*
+          - dask-cudf==24.8.*
+          - kvikio==24.8.*
           - pytest
           - pytest-cov
-          - ucx-py==0.38.*
+          - ucx-py==0.39.*
       - output_types: [conda]
         packages:
-          - distributed-ucxx==0.38.*
+          - distributed-ucxx==0.39.*
           - ucx-proc=*=gpu
-          - ucxx==0.38.*
+          - ucxx==0.39.*
     specific:
       - output_types: conda
         matrices:
diff --git a/docs/source/explicit_comms.rst b/docs/source/explicit_comms.rst
index aecbc1fd..9fde8756 100644
--- a/docs/source/explicit_comms.rst
+++ b/docs/source/explicit_comms.rst
@@ -14,4 +14,4 @@ Usage
 In order to use explicit-comms in Dask/Distributed automatically, simply define the environment variable ``DASK_EXPLICIT_COMMS=True`` or setting the ``"explicit-comms"``
 key in the `Dask configuration <https://docs.dask.org/en/latest/configuration.html>`_.
 
-It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.06/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
+It is also possible to use explicit-comms in tasks manually, see the `API <../api/#explicit-comms>`_ and our `implementation of shuffle <https://github.com/rapidsai/dask-cuda/blob/branch-24.08/dask_cuda/explicit_comms/dataframe/shuffle.py>`_ for guidance.
diff --git a/pyproject.toml b/pyproject.toml
index 35d48538..e0f45381 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ dependencies = [
     "numpy>=1.23,<2.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.6.*",
+    "rapids-dask-dependency==24.8.*",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -49,12 +49,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.6.*",
-    "dask-cudf==24.6.*",
-    "kvikio==24.6.*",
+    "cudf==24.8.*",
+    "dask-cudf==24.8.*",
+    "kvikio==24.8.*",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.38.*",
+    "ucx-py==0.39.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From 4fc6df2496d2f7e95813b29e9405935c8b679591 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 7 Jun 2024 14:58:20 -0500
Subject: [PATCH 02/17] Adopt CI/packaging codeowners (#1347)

---
 .github/CODEOWNERS | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 9bfa630e..be9daacf 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,10 +1,14 @@
 #python code owners
 dask_cuda/  @rapidsai/daskcuda-python-codeowners
 
-#build/ops code owners
-.github/           @rapidsai/ops-codeowners
-ci/                @rapidsai/ops-codeowners
-conda/             @rapidsai/ops-codeowners
-**/Dockerfile      @rapidsai/ops-codeowners
-**/.dockerignore   @rapidsai/ops-codeowners
-dependencies.yaml  @rapidsai/ops-codeowners
+#CI code owners
+/.github/                @rapidsai/ci-codeowners
+/ci/                     @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainer/    @rapidsai/packaging-codeowners
+/conda/            @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh          @rapidsai/packaging-codeowners
+pyproject.toml     @rapidsai/packaging-codeowners

From c1e27de1daf068e223ea3cd0cbc30772daeda17d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 10 Jun 2024 06:11:18 -0700
Subject: [PATCH 03/17] Remove text builds of documentation (#1346)

This PR removes text builds of the documentation, which we do not currently use for anything. Contributes to https://github.com/rapidsai/build-planning/issues/71.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/dask-cuda/pull/1346
---
 ci/build_docs.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 21d57837..18d96c9a 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -29,10 +29,8 @@ export RAPIDS_DOCS_DIR="$(mktemp -d)"
 rapids-logger "Build Python docs"
 pushd docs
 sphinx-build -b dirhtml ./source _html
-sphinx-build -b text ./source _text
-mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"{html,txt}
+mkdir -p "${RAPIDS_DOCS_DIR}/dask-cuda/"html
 mv _html/* "${RAPIDS_DOCS_DIR}/dask-cuda/html"
-mv _text/* "${RAPIDS_DOCS_DIR}/dask-cuda/txt"
 popd
 
 rapids-upload-docs

From abded3a767d95f13fbea35af772109c25d6df385 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 10 Jun 2024 08:53:39 -0500
Subject: [PATCH 04/17] use rapids-build-backend (#1343)

Contributes to https://github.com/rapidsai/build-planning/issues/31
Contributes to https://github.com/rapidsai/dependency-file-generator/issues/89

Proposes introducing `rapids-build-backend` as this project's build backend, to reduce the complexity of various CI/build scripts.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/dask-cuda/pull/1343
---
 .pre-commit-config.yaml                       |  2 +-
 ci/build_docs.sh                              |  2 +-
 ci/build_python.sh                            | 10 ++----
 ci/build_wheel.sh                             | 17 +--------
 ci/check_style.sh                             |  2 +-
 ci/release/update-version.sh                  |  6 ++--
 ci/test_python.sh                             |  2 +-
 .../all_cuda-114_arch-x86_64.yaml             | 15 ++++----
 .../all_cuda-118_arch-x86_64.yaml             | 15 ++++----
 .../all_cuda-122_arch-x86_64.yaml             | 15 ++++----
 conda/recipes/dask-cuda/meta.yaml             |  1 +
 dask_cuda/_version.py                         | 14 ++++++--
 dask_cuda/tests/test_version.py               | 12 +++++++
 dependencies.yaml                             | 35 +++++++++++++++----
 pyproject.toml                                | 18 ++++++----
 15 files changed, 99 insertions(+), 67 deletions(-)
 create mode 100644 dask_cuda/tests/test_version.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 492c96f2..b10be12a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
                 args: ["--module=dask_cuda", "--ignore-missing-imports"]
                 pass_filenames: false
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.8.0
+        rev: v1.13.11
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 18d96c9a..c2a65a41 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -7,7 +7,7 @@ rapids-logger "Create test conda environment"
 
 rapids-dependency-file-generator \
     --output conda \
-    --file_key docs \
+    --file-key docs \
     --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
 rapids-mamba-retry env create --yes -f env.yaml -n docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index e2429e98..48cece32 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -13,18 +13,12 @@ export CMAKE_GENERATOR=Ninja
 
 rapids-print-env
 
-package_name="dask_cuda"
-
-version=$(rapids-generate-version)
-commit=$(git rev-parse HEAD)
-
-echo "${version}" | tr -d '"' > VERSION
-sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_name}/_version.py"
+rapids-generate-version > ./VERSION
 
 rapids-logger "Begin py build"
 conda config --set path_conflict prevent
 
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   conda/recipes/dask-cuda
 
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 9ec82673..828972dc 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -6,22 +6,7 @@ set -euo pipefail
 source rapids-configure-sccache
 source rapids-date-string
 
-version=$(rapids-generate-version)
-commit=$(git rev-parse HEAD)
-
-echo "${version}" | tr -d '"' > VERSION
-sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "dask_cuda/_version.py"
-
-# For nightlies we want to ensure that we're pulling in alphas as well. The
-# easiest way to do so is to augment the spec with a constraint containing a
-# min alpha version that doesn't affect the version bounds but does allow usage
-# of alpha versions for that dependency without --pre
-alpha_spec=''
-if ! rapids-is-release-build; then
-        alpha_spec=',>=0.0.0a0'
-fi
-
-sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" pyproject.toml
+rapids-generate-version > ./VERSION
 
 python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
 
diff --git a/ci/check_style.sh b/ci/check_style.sh
index 9bc26fe7..f8bc1652 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -8,7 +8,7 @@ rapids-logger "Create checks conda environment"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key checks \
+  --file-key checks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
 rapids-mamba-retry env create --yes -f env.yaml -n checks
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 0d1b8b1a..ac834e5e 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -36,8 +36,8 @@ function sed_runner() {
 echo "${NEXT_FULL_TAG}" | tr -d '"' > VERSION
 
 # Bump testing dependencies
-sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml
-sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*/g" dependencies.yaml
+sed_runner "s/ucx-py==.*/ucx-py==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml
+sed_runner "s/ucxx==.*/ucxx==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" dependencies.yaml
 
 DEPENDENCIES=(
   cudf
@@ -47,7 +47,7 @@ DEPENDENCIES=(
 )
 for FILE in dependencies.yaml conda/environments/*.yaml; do
   for DEP in "${DEPENDENCIES[@]}"; do
-    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
 done
 
diff --git a/ci/test_python.sh b/ci/test_python.sh
index b52cbb6d..ef24c848 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 rapids-logger "Generate Python testing dependencies"
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_python \
+  --file-key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
 
 rapids-mamba-retry env create --yes -f env.yaml -n test
diff --git a/conda/environments/all_cuda-114_arch-x86_64.yaml b/conda/environments/all_cuda-114_arch-x86_64.yaml
index 57f475a2..c0fed8e5 100644
--- a/conda/environments/all_cuda-114_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-114_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.4
 - cudatoolkit
-- cudf==24.8.*
-- dask-cudf==24.8.*
-- distributed-ucxx==0.39.*
-- kvikio==24.8.*
+- cudf==24.8.*,>=0.0.0a0
+- dask-cudf==24.8.*,>=0.0.0a0
+- distributed-ucxx==0.39.*,>=0.0.0a0
+- kvikio==24.8.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -24,13 +24,14 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.8.*
+- rapids-build-backend>=0.3.0,<0.4.0dev0
+- rapids-dask-dependency==24.8.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*
-- ucxx==0.39.*
+- ucx-py==0.39.*,>=0.0.0a0
+- ucxx==0.39.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-114_arch-x86_64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 627df99c..d1f6933c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -10,10 +10,10 @@ dependencies:
 - click >=8.1
 - cuda-version=11.8
 - cudatoolkit
-- cudf==24.8.*
-- dask-cudf==24.8.*
-- distributed-ucxx==0.39.*
-- kvikio==24.8.*
+- cudf==24.8.*,>=0.0.0a0
+- dask-cudf==24.8.*,>=0.0.0a0
+- distributed-ucxx==0.39.*,>=0.0.0a0
+- kvikio==24.8.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -24,13 +24,14 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.8.*
+- rapids-build-backend>=0.3.0,<0.4.0dev0
+- rapids-dask-dependency==24.8.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*
-- ucxx==0.39.*
+- ucx-py==0.39.*,>=0.0.0a0
+- ucxx==0.39.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index e122bd43..4db52a6d 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -11,10 +11,10 @@ dependencies:
 - cuda-nvcc-impl
 - cuda-nvrtc
 - cuda-version=12.2
-- cudf==24.8.*
-- dask-cudf==24.8.*
-- distributed-ucxx==0.39.*
-- kvikio==24.8.*
+- cudf==24.8.*,>=0.0.0a0
+- dask-cudf==24.8.*,>=0.0.0a0
+- distributed-ucxx==0.39.*,>=0.0.0a0
+- kvikio==24.8.*,>=0.0.0a0
 - numactl-devel-cos7-x86_64
 - numba>=0.57
 - numpy>=1.23,<2.0a0
@@ -25,13 +25,14 @@ dependencies:
 - pytest
 - pytest-cov
 - python>=3.9,<3.12
-- rapids-dask-dependency==24.8.*
+- rapids-build-backend>=0.3.0,<0.4.0dev0
+- rapids-dask-dependency==24.8.*,>=0.0.0a0
 - setuptools>=64.0.0
 - sphinx
 - sphinx-click>=2.7.1
 - sphinx-rtd-theme>=0.5.1
 - ucx-proc=*=gpu
-- ucx-py==0.39.*
-- ucxx==0.39.*
+- ucx-py==0.39.*,>=0.0.0a0
+- ucxx==0.39.*,>=0.0.0a0
 - zict>=2.0.0
 name: all_cuda-122_arch-x86_64
diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml
index 357e6ded..877290d4 100644
--- a/conda/recipes/dask-cuda/meta.yaml
+++ b/conda/recipes/dask-cuda/meta.yaml
@@ -29,6 +29,7 @@ requirements:
   host:
     - python
     - pip
+    - rapids-build-backend>=0.3.0,<0.4.0.dev0
   run:
     - python
     {% for r in data.get("project", {}).get("dependencies", []) %}
diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py
index c54072ba..820bf10b 100644
--- a/dask_cuda/_version.py
+++ b/dask_cuda/_version.py
@@ -15,6 +15,16 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("dask_cuda").joinpath("VERSION").read_text().strip()
+    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
 )
-__git_commit__ = ""
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/dask_cuda/tests/test_version.py b/dask_cuda/tests/test_version.py
new file mode 100644
index 00000000..f30b2847
--- /dev/null
+++ b/dask_cuda/tests/test_version.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import dask_cuda
+
+
+def test_version_constants_are_populated():
+    # __git_commit__ will only be non-empty in a built distribution
+    assert isinstance(dask_cuda.__git_commit__, str)
+
+    # __version__ should always be non-empty
+    assert isinstance(dask_cuda.__version__, str)
+    assert len(dask_cuda.__version__) > 0
diff --git a/dependencies.yaml b/dependencies.yaml
index 3a21a1d3..c7f55283 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -74,6 +74,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
+          - rapids-build-backend>=0.3.0,<0.4.0dev0
           - setuptools>=64.0.0
   cuda_version:
     specific:
@@ -153,23 +154,23 @@ dependencies:
           - numpy>=1.23,<2.0a0
           - pandas>=1.3
           - pynvml>=11.0.0,<11.5
-          - rapids-dask-dependency==24.8.*
+          - rapids-dask-dependency==24.8.*,>=0.0.0a0
           - zict>=2.0.0
   test_python:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cudf==24.8.*
-          - dask-cudf==24.8.*
-          - kvikio==24.8.*
           - pytest
           - pytest-cov
-          - ucx-py==0.39.*
       - output_types: [conda]
         packages:
-          - distributed-ucxx==0.39.*
+          - &cudf_conda cudf==24.8.*,>=0.0.0a0
+          - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0
+          - distributed-ucxx==0.39.*,>=0.0.0a0
+          - &kvikio_conda kvikio==24.8.*,>=0.0.0a0
+          - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0
           - ucx-proc=*=gpu
-          - ucxx==0.39.*
+          - ucxx==0.39.*,>=0.0.0a0
     specific:
       - output_types: conda
         matrices:
@@ -181,3 +182,23 @@ dependencies:
               arch: aarch64
             packages:
               - numactl-devel-cos7-aarch64
+      - output_types: [requirements, pyproject]
+        matrices:
+          # kvikio should be added to the CUDA-version-specific matrices once there are wheels available
+          # ref: https://github.com/rapidsai/kvikio/pull/369
+          - matrix: {cuda: "12.*"}
+            packages:
+              - cudf-cu12==24.8.*,>=0.0.0a0
+              - dask-cudf-cu12==24.8.*,>=0.0.0a0
+              - ucx-py-cu12==0.39.*,>=0.0.0a0
+          - matrix: {cuda: "11.*"}
+            packages:
+              - cudf-cu11==24.8.*,>=0.0.0a0
+              - dask-cudf-cu11==24.8.*,>=0.0.0a0
+              - ucx-py-cu11==0.39.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *cudf_conda
+              - *dask_cudf_conda
+              - *kvikio_conda
+              - *ucx_py_conda
diff --git a/pyproject.toml b/pyproject.toml
index e0f45381..126efba6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,7 @@
 [build-system]
-build-backend = "setuptools.build_meta"
+build-backend = "rapids_build_backend.build"
 requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0dev0",
     "setuptools>=64.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
@@ -20,7 +21,7 @@ dependencies = [
     "numpy>=1.23,<2.0a0",
     "pandas>=1.3",
     "pynvml>=11.0.0,<11.5",
-    "rapids-dask-dependency==24.8.*",
+    "rapids-dask-dependency==24.8.*,>=0.0.0a0",
     "zict>=2.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -49,12 +50,12 @@ docs = [
     "sphinx-rtd-theme>=0.5.1",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 test = [
-    "cudf==24.8.*",
-    "dask-cudf==24.8.*",
-    "kvikio==24.8.*",
+    "cudf==24.8.*,>=0.0.0a0",
+    "dask-cudf==24.8.*,>=0.0.0a0",
+    "kvikio==24.8.*,>=0.0.0a0",
     "pytest",
     "pytest-cov",
-    "ucx-py==0.39.*",
+    "ucx-py==0.39.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
@@ -129,6 +130,11 @@ filterwarnings = [
     "ignore:Dask DataFrame implementation is deprecated:DeprecationWarning",
 ]
 
+[tool.rapids-build-backend]
+build-backend = "setuptools.build_meta"
+dependencies-file = "dependencies.yaml"
+disable-cuda = true
+
 [tool.setuptools]
 license-files = ["LICENSE"]
 

From 098109aef0793b43b94f9746ed9edbde44c2d761 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 13 Jun 2024 09:43:11 -0500
Subject: [PATCH 05/17] make conda recipe data-loading stricter (#1349)

Contributes to https://github.com/rapidsai/build-planning/issues/72

Proposes using `[]` subsetting instead of `.get()` in templating statements in the conda recipe that read data out of `pyproject.toml`. That'll ensure that we get a big loud build error if changes to `pyproject.toml` remove some sections that the conda recipe expects to exist.

## Notes for Reviewers

### How I tested this

Rendered the recipe.

```shell
git fetch upstream --tags

RAPIDS_DATE_STRING="2408" \
RAPIDS_PACKAGE_VERSION="24.8.0" \
conda render \
  -c conda-forge \
  -c rapidsai-nightly \
  conda/recipes/dask-cuda
```

<details><summary>It looks correct to me (click for details)</summary>

```text
--------------
Hash contents:
--------------
{}
----------
meta.yaml:
----------
package:
  name: dask-cuda
  version: 24.8.0
source:
  path: /Users/jlamb/repos/dask-cuda
build:
  entry_points:
    - dask-cuda-worker = dask_cuda.cli:worker
    - dask-cuda-config = dask_cuda.cli:config
  number: '10'
  script:
    - /Users/jlamb/miniforge3/conda-bld/dask-cuda_1718216576022/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_/bin/python
      -m pip install . -vv
  string: py310_2408_g3a04719_10
requirements:
  host:
    - bzip2 1.0.8 h93a5062_5
    - ca-certificates 2024.6.2 hf0a4a13_0
    - libffi 3.4.2 h3422bc3_5
    - libzlib 1.3.1 hfb2fe0b_1
    - ncurses 6.5 hb89a1cb_0
    - python_abi 3.10 4_cp310
    - tzdata 2024a h0c530f3_0
    - xz 5.2.6 h57fd34a_0
    - yaml 0.2.5 h3422bc3_2
    - libsqlite 3.46.0 hfb93653_0
    - openssl 3.3.1 hfb2fe0b_0
    - readline 8.2 h92ec313_1
    - tk 8.6.13 h5083fa2_1
    - python 3.10.14 h2469fbe_0_cpython
    - attrs 23.2.0 pyh71513ae_0
    - packaging 24.1 pyhd8ed1ab_0
    - pkgutil-resolve-name 1.3.10 pyhd8ed1ab_1
    - pyyaml 6.0.1 py310h2aa6e3c_1
    - rpds-py 0.18.1 py310h947b723_0
    - setuptools 70.0.0 pyhd8ed1ab_0
    - tomlkit 0.12.5 pyha770c72_0
    - wheel 0.43.0 pyhd8ed1ab_1
    - zipp 3.19.2 pyhd8ed1ab_0
    - importlib_resources 6.4.0 pyhd8ed1ab_0
    - pip 24.0 pyhd8ed1ab_0
    - referencing 0.35.1 pyhd8ed1ab_0
    - jsonschema-specifications 2023.12.1 pyhd8ed1ab_0
    - jsonschema 4.22.0 pyhd8ed1ab_0
    - rapids-dependency-file-generator 1.13.11 py_0
    - rapids-build-backend 0.3.1 py_0
  run:
    - pynvml>=11.0.0,<11.5
    - numpy>=1.23,<2.0a0
    - python_abi 3.10.* *_cp310
    - click >=8.1
    - rapids-dask-dependency==24.8.*,>=0.0.0a0
    - numba>=0.57
    - python >=3.10,<3.11.0a0
    - pandas>=1.3
    - zict>=2.0.0
test:
  commands:
    - dask cuda --help
    - dask-cuda-worker --help
    - dask cuda worker --help
    - dask-cuda-config --help
    - dask cuda config --help
  imports:
    - dask_cuda
about:
  dev_url: https://github.com/rapidsai/dask-cuda
  doc_url: https://docs.rapids.ai/api/dask-cuda/stable/
  home: https://github.com/rapidsai/dask-cuda
  license: Apache 2.0
  license_file:
    - ../../../LICENSE
  summary: Utilities for Dask and CUDA interactions
extra:
  copy_test_source_files: true
  final: true
```

</details>

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/dask-cuda/pull/1349
---
 conda/recipes/dask-cuda/meta.yaml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml
index 877290d4..eba1a4fc 100644
--- a/conda/recipes/dask-cuda/meta.yaml
+++ b/conda/recipes/dask-cuda/meta.yaml
@@ -21,8 +21,8 @@ build:
   script:
     - {{ PYTHON }} -m pip install . -vv
   entry_points:
-    {% for e in data.get("project", {}).get("scripts", {}).items() %}
-    - {{ e|join(" = ") }}
+    {% for entrypoint in data["project"]["scripts"] %}
+    - {{ entrypoint ~ ' = ' ~ data["project"]["scripts"][entrypoint] }}
     {% endfor %}
 
 requirements:
@@ -32,7 +32,7 @@ requirements:
     - rapids-build-backend>=0.3.0,<0.4.0.dev0
   run:
     - python
-    {% for r in data.get("project", {}).get("dependencies", []) %}
+    {% for r in data["project"]["dependencies"] %}
     - {{ r }}
     {% endfor %}
 
@@ -41,18 +41,18 @@ test:
     - dask_cuda
   commands:
     - dask cuda --help
-    {% for e in data.get("project", {}).get("scripts", {}).keys() %}
-    - {{ e }} --help
-    - {{ e|replace("-", " ") }} --help
+    {% for entrypoint in data["project"]["scripts"] %}
+    - {{ entrypoint }} --help
+    - {{ entrypoint|replace("-", " ") }} --help
     {% endfor %}
 
 about:
-  home: {{ data.get("project", {}).get("urls", {}).get("Homepage", "") }}
-  license: {{ data.get("project", {}).get("license", {}).get("text", "") }}
+  home: {{ data["project"]["urls"]["Homepage"] }}
+  license: {{ data["project"]["license"]["text"] }}
   license_file:
-    {% for e in data.get("tool", {}).get("setuptools", {}).get("license-files", []) %}
+    {% for e in data["tool"]["setuptools"]["license-files"] %}
     - ../../../{{ e }}
     {% endfor %}
-  summary: {{ data.get("project", {}).get("description", "") }}
-  dev_url: {{ data.get("project", {}).get("urls", {}).get("Source", "") }}
-  doc_url: {{ data.get("project", {}).get("urls", {}).get("Documentation", "") }}
+  summary: {{ data["project"]["description"] }}
+  dev_url: {{ data["project"]["urls"]["Source"] }}
+  doc_url: {{ data["project"]["urls"]["Documentation"] }}

From 7363b0cc930e9aca0253dda82d8a60fd50670f6d Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 24 Jun 2024 10:13:55 -0500
Subject: [PATCH 06/17] remove .gitattributes (#1350)

Contributes to https://github.com/rapidsai/build-planning/issues/31

Removes `.gitattributes` file.

That was added in #88 for use with `versioneer`. Per the `git` docs ([link](https://git-scm.com/docs/gitattributes#_export_subst)), setting the attribute `export-subst` on a file via a `.gitattributes` tell `git` to replace placeholders in the file with some `git` information.

This is no longer done in `_version.py` files in this project, and this project no longer uses `versioneer`. `rapids-build-backend` handles storing git commit information in the published packages.

## Notes for Reviewers

Created based on this conversation: https://github.com/rapidsai/kvikio/pull/369#discussion_r1644861520

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/dask-cuda/pull/1350
---
 .gitattributes | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 .gitattributes

diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index cf10aa23..00000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-dask_cuda/_version.py export-subst

From d8d87f30c27f8628939bc2d010588d67e1b4e078 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 26 Jun 2024 13:09:22 +0200
Subject: [PATCH 07/17] Update cuDF's `assert_eq` import (#1353)

https://github.com/rapidsai/cudf/pull/16063 has updated the import location of `assert_eq` to the public `cudf.testing.assert_eq`, this change updates imports accordingly.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/dask-cuda/pull/1353
---
 dask_cuda/tests/test_cudf_builtin_spilling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py
index d4c28ba0..80b1d482 100644
--- a/dask_cuda/tests/test_cudf_builtin_spilling.py
+++ b/dask_cuda/tests/test_cudf_builtin_spilling.py
@@ -20,7 +20,7 @@
     get_global_manager,
     set_global_manager,
 )
-from cudf.testing._utils import assert_eq  # noqa: E402
+from cudf.testing import assert_eq  # noqa: E402
 
 if get_global_manager() is not None:
     pytest.skip(

From 0a3cc7f127f4c9df277c68562d9a6ae8c6ed7a54 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Thu, 27 Jun 2024 18:55:53 +0200
Subject: [PATCH 08/17] Allow disabling RMM in benchmarks (#1352)

Allows disabling RMM in benchmarks via a new option `--disable-rmm`. This change makes benchmarks a little more similar to RMM setup in `LocalCUDACluster`/`dask cuda worker`, where not specifying `rmm-pool-size` or specifying `None` as its value entirely disables setting up RMM as the default allocator. Since for benchmarks it's desired that the default is having an RMM pool we cannot change the default `--rmm-pool-size` to `None` as that would make benchmarks run much slower by default, therefore `--disable-rmm` is the closest we can make this to the rest of Dask-CUDA.

Additionally add `--rmm-maximum-pool-size` for benchmarks.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Ayush Dattagupta (https://github.com/ayushdg)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/dask-cuda/pull/1352
---
 ci/test_python.sh               |  53 +++++++++++++
 dask_cuda/benchmarks/common.py  |  22 +++---
 dask_cuda/benchmarks/utils.py   | 131 ++++++++++++++++++++++++--------
 dask_cuda/cli.py                |   4 +
 dask_cuda/local_cuda_cluster.py |   4 +
 5 files changed, 174 insertions(+), 40 deletions(-)

diff --git a/ci/test_python.sh b/ci/test_python.sh
index ef24c848..78330a40 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -99,6 +99,59 @@ python dask_cuda/benchmarks/local_cudf_shuffle.py \
   --runs 1 \
   --backend explicit-comms
 
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --disable-rmm \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --disable-rmm-pool \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --rmm-pool-size 2GiB \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --rmm-pool-size 2GiB \
+  --rmm-maximum-pool-size 4GiB \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --rmm-pool-size 2GiB \
+  --rmm-maximum-pool-size 4GiB \
+  --enable-rmm-async \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
+DASK_DATAFRAME__QUERY_PLANNING=True \
+python dask_cuda/benchmarks/local_cudf_shuffle.py \
+  --rmm-pool-size 2GiB \
+  --rmm-maximum-pool-size 4GiB \
+  --enable-rmm-managed \
+  --partition-size="1 KiB" \
+  -d 0 \
+  --runs 1 \
+  --backend explicit-comms
+
 rapids-logger "Run local benchmark (legacy dd)"
 DASK_DATAFRAME__QUERY_PLANNING=False \
 python dask_cuda/benchmarks/local_cudf_shuffle.py \
diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py
index 1335334a..7f48d4fa 100644
--- a/dask_cuda/benchmarks/common.py
+++ b/dask_cuda/benchmarks/common.py
@@ -117,16 +117,18 @@ def run(client: Client, args: Namespace, config: Config):
     wait_for_cluster(client, shutdown_on_failure=True)
     assert len(client.scheduler_info()["workers"]) > 0
     setup_memory_pools(
-        client,
-        args.type == "gpu",
-        args.rmm_pool_size,
-        args.disable_rmm_pool,
-        args.enable_rmm_async,
-        args.enable_rmm_managed,
-        args.rmm_release_threshold,
-        args.rmm_log_directory,
-        args.enable_rmm_statistics,
-        args.enable_rmm_track_allocations,
+        client=client,
+        is_gpu=args.type == "gpu",
+        disable_rmm=args.disable_rmm,
+        disable_rmm_pool=args.disable_rmm_pool,
+        pool_size=args.rmm_pool_size,
+        maximum_pool_size=args.rmm_maximum_pool_size,
+        rmm_async=args.enable_rmm_async,
+        rmm_managed=args.enable_rmm_managed,
+        release_threshold=args.rmm_release_threshold,
+        log_directory=args.rmm_log_directory,
+        statistics=args.enable_rmm_statistics,
+        rmm_track_allocations=args.enable_rmm_track_allocations,
     )
     address_to_index, results, message_data = gather_bench_results(client, args, config)
     p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index)
diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py
index 5ac79a88..48e4755f 100644
--- a/dask_cuda/benchmarks/utils.py
+++ b/dask_cuda/benchmarks/utils.py
@@ -17,6 +17,7 @@
 from distributed.comm.addressing import get_address_host
 
 from dask_cuda.local_cuda_cluster import LocalCUDACluster
+from dask_cuda.utils import parse_device_memory_limit
 
 
 def as_noop(dsk):
@@ -93,15 +94,41 @@ def parse_benchmark_args(
         "'forkserver' can be used to avoid issues with fork not being allowed "
         "after the networking stack has been initialised.",
     )
+    cluster_args.add_argument(
+        "--disable-rmm",
+        action="store_true",
+        help="Disable RMM.",
+    )
+    cluster_args.add_argument(
+        "--disable-rmm-pool",
+        action="store_true",
+        help="Uses RMM for allocations but without a memory pool.",
+    )
     cluster_args.add_argument(
         "--rmm-pool-size",
         default=None,
         type=parse_bytes,
         help="The size of the RMM memory pool. Can be an integer (bytes) or a string "
-        "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used.",
+        "(like '4GB' or '5000M'). By default, 1/2 of the total GPU memory is used."
+        ""
+        ".. note::"
+        "    This size is a per-worker configuration, and not cluster-wide.",
     )
     cluster_args.add_argument(
-        "--disable-rmm-pool", action="store_true", help="Disable the RMM memory pool"
+        "--rmm-maximum-pool-size",
+        default=None,
+        help="When ``--rmm-pool-size`` is specified, this argument indicates the "
+        "maximum pool size.  Can be an integer (bytes), or a string (like '4GB' or "
+        "'5000M'). By default, the total available memory on the GPU is used. "
+        "``rmm_pool_size`` must be specified to use RMM pool and to set the maximum "
+        "pool size."
+        ""
+        ".. note::"
+        "    When paired with `--enable-rmm-async` the maximum size cannot be "
+        "    guaranteed due to fragmentation."
+        ""
+        ".. note::"
+        "    This size is a per-worker configuration, and not cluster-wide.",
     )
     cluster_args.add_argument(
         "--enable-rmm-managed",
@@ -407,10 +434,29 @@ def get_worker_device():
         return -1
 
 
+def setup_rmm_resources(statistics=False, rmm_track_allocations=False):
+    import cupy
+
+    import rmm
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
+    if statistics:
+        rmm.mr.set_current_device_resource(
+            rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+        )
+    if rmm_track_allocations:
+        rmm.mr.set_current_device_resource(
+            rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+        )
+
+
 def setup_memory_pool(
     dask_worker=None,
+    disable_rmm=None,
+    disable_rmm_pool=None,
     pool_size=None,
-    disable_pool=False,
+    maximum_pool_size=None,
     rmm_async=False,
     rmm_managed=False,
     release_threshold=None,
@@ -418,45 +464,66 @@ def setup_memory_pool(
     statistics=False,
     rmm_track_allocations=False,
 ):
-    import cupy
-
     import rmm
-    from rmm.allocators.cupy import rmm_cupy_allocator
 
     from dask_cuda.utils import get_rmm_log_file_name
 
     logging = log_directory is not None
 
-    if rmm_async:
-        rmm.mr.set_current_device_resource(
-            rmm.mr.CudaAsyncMemoryResource(
-                initial_pool_size=pool_size, release_threshold=release_threshold
-            )
-        )
-    else:
-        rmm.reinitialize(
-            pool_allocator=not disable_pool,
-            managed_memory=rmm_managed,
-            initial_pool_size=pool_size,
-            logging=logging,
-            log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory),
-        )
-    cupy.cuda.set_allocator(rmm_cupy_allocator)
-    if statistics:
-        rmm.mr.set_current_device_resource(
-            rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
+    if pool_size is not None:
+        pool_size = parse_device_memory_limit(pool_size, alignment_size=256)
+
+    if maximum_pool_size is not None:
+        maximum_pool_size = parse_device_memory_limit(
+            maximum_pool_size, alignment_size=256
         )
-    if rmm_track_allocations:
-        rmm.mr.set_current_device_resource(
-            rmm.mr.TrackingResourceAdaptor(rmm.mr.get_current_device_resource())
+
+    if release_threshold is not None:
+        release_threshold = parse_device_memory_limit(
+            release_threshold, alignment_size=256
         )
 
+    if not disable_rmm:
+        if rmm_async:
+            mr = rmm.mr.CudaAsyncMemoryResource(
+                initial_pool_size=pool_size,
+                release_threshold=release_threshold,
+            )
+
+            if maximum_pool_size is not None:
+                mr = rmm.mr.LimitingResourceAdaptor(
+                    mr, allocation_limit=maximum_pool_size
+                )
+
+            rmm.mr.set_current_device_resource(mr)
+
+            setup_rmm_resources(
+                statistics=statistics, rmm_track_allocations=rmm_track_allocations
+            )
+        else:
+            rmm.reinitialize(
+                pool_allocator=not disable_rmm_pool,
+                managed_memory=rmm_managed,
+                initial_pool_size=pool_size,
+                maximum_pool_size=maximum_pool_size,
+                logging=logging,
+                log_file_name=get_rmm_log_file_name(
+                    dask_worker, logging, log_directory
+                ),
+            )
+
+            setup_rmm_resources(
+                statistics=statistics, rmm_track_allocations=rmm_track_allocations
+            )
+
 
 def setup_memory_pools(
     client,
     is_gpu,
+    disable_rmm,
+    disable_rmm_pool,
     pool_size,
-    disable_pool,
+    maximum_pool_size,
     rmm_async,
     rmm_managed,
     release_threshold,
@@ -468,8 +535,10 @@ def setup_memory_pools(
         return
     client.run(
         setup_memory_pool,
+        disable_rmm=disable_rmm,
+        disable_rmm_pool=disable_rmm_pool,
         pool_size=pool_size,
-        disable_pool=disable_pool,
+        maximum_pool_size=maximum_pool_size,
         rmm_async=rmm_async,
         rmm_managed=rmm_managed,
         release_threshold=release_threshold,
@@ -482,7 +551,9 @@ def setup_memory_pools(
     client.run_on_scheduler(
         setup_memory_pool,
         pool_size=1e9,
-        disable_pool=disable_pool,
+        disable_rmm=disable_rmm,
+        disable_rmm_pool=disable_rmm_pool,
+        maximum_pool_size=maximum_pool_size,
         rmm_async=rmm_async,
         rmm_managed=rmm_managed,
         release_threshold=release_threshold,
diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index cc2d0843..ba58fe3e 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -120,6 +120,10 @@ def cuda():
     memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and
     to set the maximum pool size.
 
+    .. note::
+        When paired with `--enable-rmm-async` the maximum size cannot be guaranteed due
+        to fragmentation.
+
     .. note::
         This size is a per-worker configuration, and not cluster-wide.""",
 )
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 7a5c8c13..1b81c770 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -114,6 +114,10 @@ class LocalCUDACluster(LocalCluster):
         memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool
         and to set the maximum pool size.
 
+        .. note::
+            When paired with `--enable-rmm-async` the maximum size cannot be guaranteed
+            due to fragmentation.
+
         .. note::
             This size is a per-worker configuration, and not cluster-wide.
     rmm_managed_memory : bool, default False

From 3e0f7c3fb618b5419e0ddc561b480f440dfbbe18 Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Mon, 1 Jul 2024 06:57:19 -0700
Subject: [PATCH 09/17] Drop `setup.py` (#1354)

This is just calling `setup` from `setuptools`, which should already happen with the `setuptools` backend. So go ahead and drop `setup.py`.

Authors:
  - https://github.com/jakirkham

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1354
---
 setup.py | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 setup.py

diff --git a/setup.py b/setup.py
deleted file mode 100644
index 60684932..00000000
--- a/setup.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from setuptools import setup
-
-setup()

From fe23e45ab4ae69be193676fe98bda383c43f9e53 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 9 Jul 2024 07:54:17 -0500
Subject: [PATCH 10/17] Fix partitioning in explicit-comms shuffle (#1356)

Closes https://github.com/rapidsai/dask-cuda/issues/1355

Current version of the explicit-comms shuffle does not produce partitioning that is consistent with `dask.dataframe`.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/dask-cuda/pull/1356
---
 dask_cuda/explicit_comms/dataframe/shuffle.py | 44 ++++++++++--------
 dask_cuda/tests/test_explicit_comms.py        | 46 +++++++++++++++----
 2 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py
index 3f7b7951..70f12335 100644
--- a/dask_cuda/explicit_comms/dataframe/shuffle.py
+++ b/dask_cuda/explicit_comms/dataframe/shuffle.py
@@ -8,6 +8,9 @@
 from operator import getitem
 from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
 
+import numpy as np
+import pandas as pd
+
 import dask
 import dask.config
 import dask.dataframe
@@ -155,9 +158,16 @@ def compute_map_index(
     if column_names[0] == "_partitions":
         ind = df[column_names[0]]
     else:
-        ind = hash_object_dispatch(
-            df[column_names] if column_names else df, index=False
-        )
+        # Need to cast numerical dtypes to be consistent
+        # with `dask.dataframe.shuffle.partitioning_index`
+        dtypes = {}
+        index = df[column_names] if column_names else df
+        for col, dtype in index.dtypes.items():
+            if pd.api.types.is_numeric_dtype(dtype):
+                dtypes[col] = np.float64
+        if dtypes:
+            index = index.astype(dtypes, errors="ignore")
+        ind = hash_object_dispatch(index, index=False)
     return ind % npartitions
 
 
@@ -187,15 +197,8 @@ def partition_dataframe(
     partitions
         Dict of dataframe-partitions, mapping partition-ID to dataframe
     """
-    if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"):
-        return dict(
-            zip(
-                range(npartitions),
-                df.partition_by_hash(
-                    column_names, npartitions, keep_index=not ignore_index
-                ),
-            )
-        )
+    # TODO: Use `partition_by_hash` if/when dtype-casting is added
+    # (See: https://github.com/rapidsai/cudf/issues/16221)
     map_index = compute_map_index(df, column_names, npartitions)
     return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index)
 
@@ -529,18 +532,19 @@ def shuffle(
     # TODO: can we do this without using `submit()` to avoid the overhead
     #       of creating a Future for each dataframe partition?
 
-    futures = []
+    _futures = {}
     for rank in ranks:
         for part_id in rank_to_out_part_ids[rank]:
-            futures.append(
-                c.client.submit(
-                    getitem,
-                    shuffle_result[rank],
-                    part_id,
-                    workers=[c.worker_addresses[rank]],
-                )
+            _futures[part_id] = c.client.submit(
+                getitem,
+                shuffle_result[rank],
+                part_id,
+                workers=[c.worker_addresses[rank]],
             )
 
+    # Make sure partitions are properly ordered
+    futures = [_futures.pop(i) for i in range(npartitions)]
+
     # Create a distributed Dataframe from all the pieces
     divs = [None] * (len(futures) + 1)
     kwargs = {"meta": df_meta, "divisions": divs, "prefix": "explicit-comms-shuffle"}
diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py
index f495648e..2806dc1c 100644
--- a/dask_cuda/tests/test_explicit_comms.py
+++ b/dask_cuda/tests/test_explicit_comms.py
@@ -109,7 +109,14 @@ def test_dataframe_merge_empty_partitions():
 
 def check_partitions(df, npartitions):
     """Check that all values in `df` hashes to the same"""
-    hashes = partitioning_index(df, npartitions)
+    dtypes = {}
+    for col, dtype in df.dtypes.items():
+        if pd.api.types.is_numeric_dtype(dtype):
+            dtypes[col] = np.float64
+    if not dtypes:
+        dtypes = None
+
+    hashes = partitioning_index(df, npartitions, cast_dtype=dtypes)
     if len(hashes) > 0:
         return len(hashes.unique()) == 1
     else:
@@ -128,11 +135,10 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
         worker_class=IncreasedCloseTimeoutNanny,
         processes=True,
     ) as cluster:
-        with Client(cluster) as client:
-            all_workers = list(client.get_worker_logs().keys())
+        with Client(cluster):
             comms.default_comms()
             np.random.seed(42)
-            df = pd.DataFrame({"key": np.random.random(100)})
+            df = pd.DataFrame({"key": np.random.randint(0, high=100, size=100)})
             if backend == "cudf":
                 df = cudf.DataFrame.from_pandas(df)
 
@@ -141,15 +147,13 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
 
             for input_nparts in range(1, 5):
                 for output_nparts in range(1, 5):
-                    ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist(
-                        workers=all_workers
-                    )
+                    ddf1 = dd.from_pandas(df.copy(), npartitions=input_nparts)
                     # To reduce test runtime, we change the batchsizes here instead
                     # of using a test parameter.
                     for batchsize in (-1, 1, 2):
                         with dask.config.set(explicit_comms_batchsize=batchsize):
                             ddf = explicit_comms_shuffle(
-                                ddf,
+                                ddf1,
                                 ["_partitions"] if _partitions else ["key"],
                                 npartitions=output_nparts,
                                 batchsize=batchsize,
@@ -177,6 +181,32 @@ def _test_dataframe_shuffle(backend, protocol, n_workers, _partitions):
                                 got = ddf.compute().sort_values("key")
                                 assert_eq(got, expected)
 
+                                # Check that partitioning is consistent with "tasks"
+                                ddf_tasks = ddf1.shuffle(
+                                    ["key"],
+                                    npartitions=output_nparts,
+                                    shuffle_method="tasks",
+                                )
+                                for i in range(output_nparts):
+                                    expected_partition = ddf_tasks.partitions[
+                                        i
+                                    ].compute()["key"]
+                                    actual_partition = ddf.partitions[i].compute()[
+                                        "key"
+                                    ]
+                                    if backend == "cudf":
+                                        expected_partition = (
+                                            expected_partition.values_host
+                                        )
+                                        actual_partition = actual_partition.values_host
+                                    else:
+                                        expected_partition = expected_partition.values
+                                        actual_partition = actual_partition.values
+                                    assert all(
+                                        np.sort(expected_partition)
+                                        == np.sort(actual_partition)
+                                    )
+
 
 @pytest.mark.parametrize("nworkers", [1, 2, 3])
 @pytest.mark.parametrize("backend", ["pandas", "cudf"])

From c31aaac30ae07f8b57c80380e6d517c48f591fe0 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 16 Jul 2024 16:03:06 -0400
Subject: [PATCH 11/17] Build and test with CUDA 12.5.1 (#1357)

This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1357
---
 .github/workflows/build.yaml                         | 10 +++++-----
 .github/workflows/pr.yaml                            | 12 ++++++------
 .github/workflows/test.yaml                          |  2 +-
 ...rch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} |  4 ++--
 dependencies.yaml                                    |  6 +++++-
 docs/source/install.rst                              |  4 ++--
 6 files changed, 21 insertions(+), 17 deletions(-)
 rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (94%)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 69b0de5f..237f5595 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4e56d24d..e7fbb292 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 7a884c5c..f5bd04c5 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
similarity index 94%
rename from conda/environments/all_cuda-122_arch-x86_64.yaml
rename to conda/environments/all_cuda-125_arch-x86_64.yaml
index 4db52a6d..a27dea72 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -10,7 +10,7 @@ dependencies:
 - click >=8.1
 - cuda-nvcc-impl
 - cuda-nvrtc
-- cuda-version=12.2
+- cuda-version=12.5
 - cudf==24.8.*,>=0.0.0a0
 - dask-cudf==24.8.*,>=0.0.0a0
 - distributed-ucxx==0.39.*,>=0.0.0a0
@@ -35,4 +35,4 @@ dependencies:
 - ucx-py==0.39.*,>=0.0.0a0
 - ucxx==0.39.*,>=0.0.0a0
 - zict>=2.0.0
-name: all_cuda-122_arch-x86_64
+name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index c7f55283..910edc08 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,7 +3,7 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.4", "11.8", "12.2"]
+      cuda: ["11.4", "11.8", "12.5"]
       arch: [x86_64]
     includes:
       - build_python
@@ -100,6 +100,10 @@ dependencies:
               cuda: "12.2"
             packages:
               - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
   cuda:
     specific:
       - output_types: conda
diff --git a/docs/source/install.rst b/docs/source/install.rst
index e522ae3c..43082a67 100644
--- a/docs/source/install.rst
+++ b/docs/source/install.rst
@@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need:
 - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility <https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility>`_ for an overview of CUDA Toolkit driver requirements
 
 Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``.
-To install the latest version of Dask-CUDA along with CUDA Toolkit 12.0:
+To install the latest version of Dask-CUDA along with CUDA Toolkit 12.5:
 
 .. code-block:: bash
 
-    conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.0
+    conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cuda-version=12.5
 
 Pip
 ---

From b27920021a085de5bd12fe317fa2585248665f5d Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 19 Jul 2024 12:57:59 -0400
Subject: [PATCH 12/17] Use workflow branch 24.08 again (#1359)

After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/dask-cuda/pull/1359
---
 .github/workflows/build.yaml | 10 +++++-----
 .github/workflows/pr.yaml    | 12 ++++++------
 .github/workflows/test.yaml  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 237f5595..69b0de5f 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   conda-python-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   upload-conda:
     needs: [conda-python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -72,7 +72,7 @@ jobs:
   wheel-publish:
     needs: wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e7fbb292..4e56d24d 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -18,26 +18,26 @@ jobs:
       - docs-build
       - wheel-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
   conda-python-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -46,7 +46,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: pull-request
       # Package is pure Python and only ever requires one build.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index f5bd04c5..7a884c5c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From fa226b1f1ea788c44db0de202098700f6a5a73e3 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 19 Jul 2024 18:27:08 -0400
Subject: [PATCH 13/17] Use verify-alpha-spec hook (#1360)

With the deployment of rapids-build-backend, we need to make sure our dependencies have alpha specs.

Contributes to https://github.com/rapidsai/build-planning/issues/31

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - https://github.com/jakirkham
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/dask-cuda/pull/1360
---
 .pre-commit-config.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b10be12a..33508081 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,6 +32,10 @@ repos:
                 additional_dependencies: [types-cachetools]
                 args: ["--module=dask_cuda", "--ignore-missing-imports"]
                 pass_filenames: false
+      - repo: https://github.com/rapidsai/pre-commit-hooks
+        rev: v0.3.0
+        hooks:
+            - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
         rev: v1.13.11
         hooks:

From d6cafc152f8dfb46201c644dfcbdcf11d7c14f3e Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Wed, 24 Jul 2024 13:50:54 +0200
Subject: [PATCH 14/17] Add arguments to enable cuDF spilling and set
 statistics (#1362)

Add arguments to enable cuDF spilling and set statistics in `dask cuda worker`/`LocalCUDACluster`. This is implemented as a Dask plugin, and does not require users anymore to rely on `client.run` to do that.

Closes #1280

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/dask-cuda/pull/1362
---
 dask_cuda/cli.py                           | 18 +++++++
 dask_cuda/cuda_worker.py                   | 11 +++-
 dask_cuda/local_cuda_cluster.py            | 21 +++++++-
 dask_cuda/plugins.py                       | 15 ++++++
 dask_cuda/tests/test_dask_cuda_worker.py   | 58 ++++++++++++++++++++++
 dask_cuda/tests/test_local_cuda_cluster.py | 48 ++++++++++++++++++
 6 files changed, 169 insertions(+), 2 deletions(-)

diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py
index ba58fe3e..6a3518e0 100644
--- a/dask_cuda/cli.py
+++ b/dask_cuda/cli.py
@@ -101,6 +101,20 @@ def cuda():
     total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"`` or 0 to
     disable spilling to host (i.e. allow full device memory usage).""",
 )
+@click.option(
+    "--enable-cudf-spill/--disable-cudf-spill",
+    default=False,
+    show_default=True,
+    help="""Enable automatic cuDF spilling. WARNING: This should NOT be used with
+    JIT-Unspill.""",
+)
+@click.option(
+    "--cudf-spill-stats",
+    type=int,
+    default=0,
+    help="""Set the cuDF spilling statistics level. This option has no effect if
+    `--enable-cudf-spill` is not specified.""",
+)
 @click.option(
     "--rmm-pool-size",
     default=None,
@@ -330,6 +344,8 @@ def worker(
     name,
     memory_limit,
     device_memory_limit,
+    enable_cudf_spill,
+    cudf_spill_stats,
     rmm_pool_size,
     rmm_maximum_pool_size,
     rmm_managed_memory,
@@ -402,6 +418,8 @@ def worker(
             name,
             memory_limit,
             device_memory_limit,
+            enable_cudf_spill,
+            cudf_spill_stats,
             rmm_pool_size,
             rmm_maximum_pool_size,
             rmm_managed_memory,
diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py
index e25a7c14..b88c9bc9 100644
--- a/dask_cuda/cuda_worker.py
+++ b/dask_cuda/cuda_worker.py
@@ -20,7 +20,7 @@
 
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, PreImport, RMMSetup
+from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
@@ -41,6 +41,8 @@ def __init__(
         name=None,
         memory_limit="auto",
         device_memory_limit="auto",
+        enable_cudf_spill=False,
+        cudf_spill_stats=0,
         rmm_pool_size=None,
         rmm_maximum_pool_size=None,
         rmm_managed_memory=False,
@@ -166,6 +168,12 @@ def del_pid_file():
         if device_memory_limit is None and memory_limit is None:
             data = lambda _: {}
         elif jit_unspill:
+            if enable_cudf_spill:
+                warnings.warn(
+                    "Enabling cuDF spilling and JIT-Unspill together is not "
+                    "safe, consider disabling JIT-Unspill."
+                )
+
             data = lambda i: (
                 ProxifyHostFile,
                 {
@@ -217,6 +225,7 @@ def del_pid_file():
                         track_allocations=rmm_track_allocations,
                     ),
                     PreImport(pre_import),
+                    CUDFSetup(spill=enable_cudf_spill, spill_stats=cudf_spill_stats),
                 },
                 name=name if nprocs == 1 or name is None else str(name) + "-" + str(i),
                 local_directory=local_directory,
diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py
index 1b81c770..202373e9 100644
--- a/dask_cuda/local_cuda_cluster.py
+++ b/dask_cuda/local_cuda_cluster.py
@@ -10,7 +10,7 @@
 
 from .device_host_file import DeviceHostFile
 from .initialize import initialize
-from .plugins import CPUAffinity, PreImport, RMMSetup
+from .plugins import CPUAffinity, CUDFSetup, PreImport, RMMSetup
 from .proxify_host_file import ProxifyHostFile
 from .utils import (
     cuda_visible_devices,
@@ -73,6 +73,14 @@ class LocalCUDACluster(LocalCluster):
         starts spilling to host memory. Can be an integer (bytes), float (fraction of
         total device memory), string (like ``"5GB"`` or ``"5000M"``), or ``"auto"``, 0,
         or ``None`` to disable spilling to host (i.e. allow full device memory usage).
+    enable_cudf_spill : bool, default False
+        Enable automatic cuDF spilling.
+
+        .. warning::
+            This should NOT be used together with JIT-Unspill.
+    cudf_spill_stats : int, default 0
+        Set the cuDF spilling statistics level. This option has no effect if
+        ``enable_cudf_spill=False``.
     local_directory : str or None, default None
         Path on local machine to store temporary files. Can be a string (like
         ``"path/to/files"``) or ``None`` to fall back on the value of
@@ -209,6 +217,8 @@ def __init__(
         threads_per_worker=1,
         memory_limit="auto",
         device_memory_limit=0.8,
+        enable_cudf_spill=False,
+        cudf_spill_stats=0,
         data=None,
         local_directory=None,
         shared_filesystem=None,
@@ -259,6 +269,8 @@ def __init__(
         self.device_memory_limit = parse_device_memory_limit(
             device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES)
         )
+        self.enable_cudf_spill = enable_cudf_spill
+        self.cudf_spill_stats = cudf_spill_stats
 
         self.rmm_pool_size = rmm_pool_size
         self.rmm_maximum_pool_size = rmm_maximum_pool_size
@@ -302,6 +314,12 @@ def __init__(
             if device_memory_limit is None and memory_limit is None:
                 data = {}
             elif jit_unspill:
+                if enable_cudf_spill:
+                    warnings.warn(
+                        "Enabling cuDF spilling and JIT-Unspill together is not "
+                        "safe, consider disabling JIT-Unspill."
+                    )
+
                 data = (
                     ProxifyHostFile,
                     {
@@ -414,6 +432,7 @@ def new_worker_spec(self):
                         track_allocations=self.rmm_track_allocations,
                     ),
                     PreImport(self.pre_import),
+                    CUDFSetup(self.enable_cudf_spill, self.cudf_spill_stats),
                 },
             }
         )
diff --git a/dask_cuda/plugins.py b/dask_cuda/plugins.py
index 4eba97f2..122f93ff 100644
--- a/dask_cuda/plugins.py
+++ b/dask_cuda/plugins.py
@@ -14,6 +14,21 @@ def setup(self, worker=None):
         os.sched_setaffinity(0, self.cores)
 
 
+class CUDFSetup(WorkerPlugin):
+    def __init__(self, spill, spill_stats):
+        self.spill = spill
+        self.spill_stats = spill_stats
+
+    def setup(self, worker=None):
+        try:
+            import cudf
+
+            cudf.set_option("spill", self.spill)
+            cudf.set_option("spill_stats", self.spill_stats)
+        except ImportError:
+            pass
+
+
 class RMMSetup(WorkerPlugin):
     def __init__(
         self,
diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py
index 974ad131..505af12f 100644
--- a/dask_cuda/tests/test_dask_cuda_worker.py
+++ b/dask_cuda/tests/test_dask_cuda_worker.py
@@ -231,6 +231,64 @@ def test_rmm_logging(loop):  # noqa: F811
                     assert v is rmm.mr.LoggingResourceAdaptor
 
 
+def test_cudf_spill_disabled(loop):  # noqa: F811
+    cudf = pytest.importorskip("cudf")
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9369",
+                "--host",
+                "127.0.0.1",
+                "--no-dashboard",
+            ]
+        ):
+            with Client("127.0.0.1:9369", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                cudf_spill = client.run(
+                    cudf.get_option,
+                    "spill",
+                )
+                for v in cudf_spill.values():
+                    assert v is False
+
+                cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                for v in cudf_spill_stats.values():
+                    assert v == 0
+
+
+def test_cudf_spill(loop):  # noqa: F811
+    cudf = pytest.importorskip("cudf")
+    with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
+        with popen(
+            [
+                "dask",
+                "cuda",
+                "worker",
+                "127.0.0.1:9369",
+                "--host",
+                "127.0.0.1",
+                "--no-dashboard",
+                "--enable-cudf-spill",
+                "--cudf-spill-stats",
+                "2",
+            ]
+        ):
+            with Client("127.0.0.1:9369", loop=loop) as client:
+                assert wait_workers(client, n_gpus=get_n_gpus())
+
+                cudf_spill = client.run(cudf.get_option, "spill")
+                for v in cudf_spill.values():
+                    assert v is True
+
+                cudf_spill_stats = client.run(cudf.get_option, "spill_stats")
+                for v in cudf_spill_stats.values():
+                    assert v == 2
+
+
 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 def test_dashboard_address(loop):  # noqa: F811
     with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]):
diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py
index b05389e4..b144d111 100644
--- a/dask_cuda/tests/test_local_cuda_cluster.py
+++ b/dask_cuda/tests/test_local_cuda_cluster.py
@@ -500,6 +500,54 @@ async def test_worker_fraction_limits():
             )
 
 
+@gen_test(timeout=20)
+async def test_cudf_spill_disabled():
+    cudf = pytest.importorskip("cudf")
+
+    async with LocalCUDACluster(
+        asynchronous=True,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            cudf_spill = await client.run(
+                cudf.get_option,
+                "spill",
+            )
+            for v in cudf_spill.values():
+                assert v is False
+
+            cudf_spill_stats = await client.run(
+                cudf.get_option,
+                "spill_stats",
+            )
+            for v in cudf_spill_stats.values():
+                assert v == 0
+
+
+@gen_test(timeout=20)
+async def test_cudf_spill():
+    cudf = pytest.importorskip("cudf")
+
+    async with LocalCUDACluster(
+        enable_cudf_spill=True,
+        cudf_spill_stats=2,
+        asynchronous=True,
+    ) as cluster:
+        async with Client(cluster, asynchronous=True) as client:
+            cudf_spill = await client.run(
+                cudf.get_option,
+                "spill",
+            )
+            for v in cudf_spill.values():
+                assert v is True
+
+            cudf_spill_stats = await client.run(
+                cudf.get_option,
+                "spill_stats",
+            )
+            for v in cudf_spill_stats.values():
+                assert v == 2
+
+
 @pytest.mark.parametrize(
     "protocol",
     ["ucx", "ucxx"],

From 13a5f474c02399a975b9f315ab6010d933bdf077 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 10:03:56 -0500
Subject: [PATCH 15/17] split up CUDA-suffixed dependencies in
 dependencies.yaml (#1364)

Contributes to https://github.com/rapidsai/build-planning/issues/31

In short, RAPIDS DLFW builds want to produce wheels with unsuffixed dependencies, e.g. `cudf` depending on `rmm`, not `rmm-cu12`.

This PR is part of a series across all of RAPIDS to try to support that type of build by setting up CUDA-suffixed and CUDA-unsuffixed dependency lists in `dependencies.yaml`.

For more details, see:
* https://github.com/rapidsai/build-planning/issues/31#issuecomment-2245815818
* https://github.com/rapidsai/cudf/pull/16183

## Notes for Reviewers

### Why target 24.08?

This is targeting 24.08 because:

1. it should be very low-risk
2. getting these changes into 24.08 prevents the need to carry around patches for every library in DLFW builds using RAPIDS 24.08

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/dask-cuda/pull/1364
---
 dependencies.yaml | 22 ++++++++++++++++++++--
 pyproject.toml    |  1 +
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 910edc08..f547df6b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -190,16 +190,34 @@ dependencies:
         matrices:
           # kvikio should be added to the CUDA-version-specific matrices once there are wheels available
           # ref: https://github.com/rapidsai/kvikio/pull/369
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu12==24.8.*,>=0.0.0a0
               - dask-cudf-cu12==24.8.*,>=0.0.0a0
               - ucx-py-cu12==0.39.*,>=0.0.0a0
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "false"
+            packages:
+              - *cudf_conda
+              - *dask_cudf_conda
+              - *ucx_py_conda
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu11==24.8.*,>=0.0.0a0
               - dask-cudf-cu11==24.8.*,>=0.0.0a0
               - ucx-py-cu11==0.39.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "false"
+            packages:
+              - *cudf_conda
+              - *dask_cudf_conda
+              - *ucx_py_conda
           - matrix:
             packages:
               - *cudf_conda
diff --git a/pyproject.toml b/pyproject.toml
index 126efba6..b6c431d6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -134,6 +134,7 @@ filterwarnings = [
 build-backend = "setuptools.build_meta"
 dependencies-file = "dependencies.yaml"
 disable-cuda = true
+matrix-entry = "cuda_suffixed=true"
 
 [tool.setuptools]
 license-files = ["LICENSE"]

From 064e2544e3545a4f7478b41a1c7459212a7768ce Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 18:31:32 -0500
Subject: [PATCH 16/17] consolidate cuda_suffixed=false blocks in
 dependencies.yaml, fix update-version.sh (#1367)

Contributes to https://github.com/rapidsai/build-planning/issues/31.

Follow-up to #1364.

Implements some of the suggestions made in https://github.com/rapidsai/cudf/pull/16183 (after #1364 was already merged):

* removing `cuda_suffixed: "false"` blocks in `dependencies.yaml` wherever they're identical to each other and the fallback matrix
* changing `dependencies.yaml` anchors with names like `*_conda` to `*_unsuffixed`, to reflect the fact that they're not conda-specific
* checking that `update-version.sh` catches all changes to versions

## Notes for Reviewers

### How I tested this

Looked for `update-versions.sh` issues manually like this:

```shell
git fetch upstream --tags
ci/release/update-version.sh '24.10.0'
git grep -E '24\.8|24\.08|0\.39'
```

The did find a few problems (like UCX dependency versions not being updated). This fixes those issues.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/dask-cuda/pull/1367
---
 ci/release/update-version.sh | 17 +++++++++++++++--
 dependencies.yaml            | 30 ++++++++----------------------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index ac834e5e..a9fe1d02 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -45,10 +45,23 @@ DEPENDENCIES=(
   kvikio
   rapids-dask-dependency
 )
-for FILE in dependencies.yaml conda/environments/*.yaml; do
-  for DEP in "${DEPENDENCIES[@]}"; do
+for DEP in "${DEPENDENCIES[@]}"; do
+  for FILE in dependencies.yaml conda/environments/*.yaml; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
+  sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" pyproject.toml
+done
+
+UCX_DEPENDENCIES=(
+  distributed-ucxx
+  ucx-py
+  ucxx
+)
+for DEP in "${UCX_DEPENDENCIES[@]}"; do
+  for FILE in dependencies.yaml conda/environments/*.yaml; do
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0/g" "${FILE}"
+  done
+  sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_UCXPY_VERSION}.*,>=0.0.0a0\"/g" pyproject.toml
 done
 
 # CI files
diff --git a/dependencies.yaml b/dependencies.yaml
index f547df6b..c3b62965 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -168,11 +168,11 @@ dependencies:
           - pytest-cov
       - output_types: [conda]
         packages:
-          - &cudf_conda cudf==24.8.*,>=0.0.0a0
-          - &dask_cudf_conda dask-cudf==24.8.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==24.8.*,>=0.0.0a0
+          - &dask_cudf_unsuffixed dask-cudf==24.8.*,>=0.0.0a0
           - distributed-ucxx==0.39.*,>=0.0.0a0
-          - &kvikio_conda kvikio==24.8.*,>=0.0.0a0
-          - &ucx_py_conda ucx-py==0.39.*,>=0.0.0a0
+          - &kvikio_unsuffixed kvikio==24.8.*,>=0.0.0a0
+          - &ucx_py_unsuffixed ucx-py==0.39.*,>=0.0.0a0
           - ucx-proc=*=gpu
           - ucxx==0.39.*,>=0.0.0a0
     specific:
@@ -197,13 +197,6 @@ dependencies:
               - cudf-cu12==24.8.*,>=0.0.0a0
               - dask-cudf-cu12==24.8.*,>=0.0.0a0
               - ucx-py-cu12==0.39.*,>=0.0.0a0
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "false"
-            packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *ucx_py_conda
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
@@ -211,16 +204,9 @@ dependencies:
               - cudf-cu11==24.8.*,>=0.0.0a0
               - dask-cudf-cu11==24.8.*,>=0.0.0a0
               - ucx-py-cu11==0.39.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "false"
-            packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *ucx_py_conda
           - matrix:
             packages:
-              - *cudf_conda
-              - *dask_cudf_conda
-              - *kvikio_conda
-              - *ucx_py_conda
+              - *cudf_unsuffixed
+              - *dask_cudf_unsuffixed
+              - *kvikio_unsuffixed
+              - *ucx_py_unsuffixed

From bf84f99de5f477c4a230a6b9890b49c06908f55f Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 7 Aug 2024 10:42:31 -0400
Subject: [PATCH 17/17] Update Changelog [skip ci]

---
 CHANGELOG.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ea704c1..37c58851 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,29 @@
+# dask-cuda 24.08.00 (7 Aug 2024)
+
+## 🐛 Bug Fixes
+
+- Fix partitioning in explicit-comms shuffle ([#1356](https://github.com/rapidsai/dask-cuda/pull/1356)) [@rjzamora](https://github.com/rjzamora)
+- Update cuDF&#39;s `assert_eq` import ([#1353](https://github.com/rapidsai/dask-cuda/pull/1353)) [@pentschev](https://github.com/pentschev)
+
+## 🚀 New Features
+
+- Add arguments to enable cuDF spilling and set statistics ([#1362](https://github.com/rapidsai/dask-cuda/pull/1362)) [@pentschev](https://github.com/pentschev)
+- Allow disabling RMM in benchmarks ([#1352](https://github.com/rapidsai/dask-cuda/pull/1352)) [@pentschev](https://github.com/pentschev)
+
+## 🛠️ Improvements
+
+- consolidate cuda_suffixed=false blocks in dependencies.yaml, fix update-version.sh ([#1367](https://github.com/rapidsai/dask-cuda/pull/1367)) [@jameslamb](https://github.com/jameslamb)
+- split up CUDA-suffixed dependencies in dependencies.yaml ([#1364](https://github.com/rapidsai/dask-cuda/pull/1364)) [@jameslamb](https://github.com/jameslamb)
+- Use verify-alpha-spec hook ([#1360](https://github.com/rapidsai/dask-cuda/pull/1360)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Use workflow branch 24.08 again ([#1359](https://github.com/rapidsai/dask-cuda/pull/1359)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Build and test with CUDA 12.5.1 ([#1357](https://github.com/rapidsai/dask-cuda/pull/1357)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Drop `setup.py` ([#1354](https://github.com/rapidsai/dask-cuda/pull/1354)) [@jakirkham](https://github.com/jakirkham)
+- remove .gitattributes ([#1350](https://github.com/rapidsai/dask-cuda/pull/1350)) [@jameslamb](https://github.com/jameslamb)
+- make conda recipe data-loading stricter ([#1349](https://github.com/rapidsai/dask-cuda/pull/1349)) [@jameslamb](https://github.com/jameslamb)
+- Adopt CI/packaging codeowners ([#1347](https://github.com/rapidsai/dask-cuda/pull/1347)) [@bdice](https://github.com/bdice)
+- Remove text builds of documentation ([#1346](https://github.com/rapidsai/dask-cuda/pull/1346)) [@vyasr](https://github.com/vyasr)
+- use rapids-build-backend ([#1343](https://github.com/rapidsai/dask-cuda/pull/1343)) [@jameslamb](https://github.com/jameslamb)
+
 # dask-cuda 24.06.00 (5 Jun 2024)
 
 ## 🐛 Bug Fixes