From f4aeab2bda88d7338a3b2f8c7971e5f90ed88105 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 10 Nov 2022 12:58:18 -0500 Subject: [PATCH 01/31] DOC --- CHANGELOG.md | 4 ++++ ci/gpu/build.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44cbac4c..399ac3c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# dask-cuda 23.02.00 (Date TBD) + +Please see https://github.com/rapidsai/dask-cuda/releases/tag/v23.02.00a for the latest changes to this development branch. + # dask-cuda 22.12.00 (Date TBD) Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.12.00a for the latest changes to this development branch. diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e41f9976..cb0d22fb 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` export UCX_PATH=$CONDA_PREFIX -export UCXPY_VERSION=0.29.* +export UCXPY_VERSION=0.30.* unset GIT_DESCRIBE_TAG # Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x, From d6ff68daae638c30e1e2e25f2fb91ecc1ee8f6ea Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 5 Dec 2022 08:14:32 -0800 Subject: [PATCH 02/31] Enable copy_prs. [skip gpuci] (#1063) Enables copying PRs so that GitHub Actions CI can run. --- .github/ops-bot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 0a52b679..5808edbd 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -6,3 +6,4 @@ branch_checker: true label_checker: true release_drafter: true external_contributors: false +copy_prs: true From 3535cd35c1f41ffd60c787da476b932385eb5847 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 13 Dec 2022 00:25:12 +0530 Subject: [PATCH 03/31] Unpin `dask` and `distributed` for development (#1060) This PR unpins `dask` and `distributed` to `2022.12.0+` for `23.02` development. xref: https://github.com/rapidsai/cudf/pull/12302 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1060 --- ci/cpu/build.sh | 4 ++-- ci/gpu/build.sh | 4 ++-- dask_cuda/cuda_worker.py | 15 ++++----------- dask_cuda/local_cuda_cluster.py | 21 +++++++-------------- pyproject.toml | 4 ++-- 5 files changed, 17 insertions(+), 31 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 5ed0a322..6b91ca9e 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -21,10 +21,10 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, # `dask/label/dev` channel is removed. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.11.1" +export DASK_STABLE_VERSION="2022.12.0" # Switch to project root; also root of repo checkout cd "$WORKSPACE" diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 86c41095..e71b89e4 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,10 +35,10 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 # Install dask and distributed from main branch. Usually needed during # development time and disabled before a new dask-cuda release. 
-export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.11.1" +export DASK_STABLE_VERSION="2022.12.0" # Temporary workaround for Jupyter errors. # See https://github.com/rapidsai/dask-cuda/issues/1040 diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 5e14aba8..b7682de2 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -16,7 +16,6 @@ enable_proctitle_on_children, enable_proctitle_on_current, ) -from distributed.utils import has_arg from distributed.worker_memory import parse_memory_limit from .device_host_file import DeviceHostFile @@ -86,16 +85,10 @@ def __init__( raise ValueError("nthreads must be higher than 0.") # Set nthreads=1 when parsing mem_limit since it only depends on nprocs - if has_arg(parse_memory_limit, "logger"): - # TODO: Remove has_arg check after 2022.11.1 support is dropped - logger = logging.getLogger(__name__) - memory_limit = parse_memory_limit( - memory_limit=memory_limit, nthreads=1, total_cores=nprocs, logger=logger - ) - else: - memory_limit = parse_memory_limit( - memory_limit=memory_limit, nthreads=1, total_cores=nprocs - ) + logger = logging.getLogger(__name__) + memory_limit = parse_memory_limit( + memory_limit=memory_limit, nthreads=1, total_cores=nprocs, logger=logger + ) if pid_file: with open(pid_file, "w") as f: diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index ff93532d..115c419c 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -5,7 +5,6 @@ import dask from distributed import LocalCluster, Nanny, Worker -from distributed.utils import has_arg from distributed.worker_memory import parse_memory_limit from .device_host_file import DeviceHostFile @@ -233,19 +232,13 @@ def __init__( if n_workers < 1: raise ValueError("Number of workers cannot be less than 1.") # Set nthreads=1 when parsing mem_limit since it only depends on n_workers - if has_arg(parse_memory_limit, "logger"): - # TODO: Remove has_arg check after 2022.11.1 support is dropped - logger = logging.getLogger(__name__) - self.memory_limit = parse_memory_limit( - memory_limit=memory_limit, - nthreads=1, - total_cores=n_workers, - logger=logger, - ) - else: - self.memory_limit = parse_memory_limit( - memory_limit=memory_limit, nthreads=1, total_cores=n_workers - ) + logger = logging.getLogger(__name__) + self.memory_limit = parse_memory_limit( + memory_limit=memory_limit, + nthreads=1, + total_cores=n_workers, + logger=logger, + ) self.device_memory_limit = parse_device_memory_limit( device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES) ) diff --git a/pyproject.toml b/pyproject.toml index 4eec772d..beb3aa1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license= { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ - "dask ==2022.11.1", - "distributed ==2022.11.1", + "dask >=2022.12.0", + "distributed >=2022.12.0", "pynvml >=11.0.0", "numpy >=1.18.0", "numba >=0.54", From aedc9550319a6fc20602ad450ad1aad3a5f6c160 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 14 Dec 2022 13:40:58 -0800 Subject: [PATCH 04/31] Reorder channel priority. (#1067) Aligns conda channel priority in the installation guide with changes made for the 22.10.01 hotfix. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/dask-cuda/pull/1067 --- docs/source/install.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/install.rst b/docs/source/install.rst index eb303346..b8442b4f 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -12,11 +12,11 @@ To use Dask-CUDA on your system, you will need: - A version of NVIDIA CUDA Toolkit compatible with the installed driver version; see Table 1 of `CUDA Compatibility -- Binary Compatibility `_ for an overview of CUDA Toolkit driver requirements Once the proper CUDA Toolkit version has been determined, it can be installed using along with Dask-CUDA using ``conda``. -To install the latest version of Dask-CUDA along with CUDA Toolkit 11.0: +To install the latest version of Dask-CUDA along with CUDA Toolkit 11.5: .. code-block:: bash - conda install -c rapidsai -c nvidia -c conda-forge dask-cuda cudatoolkit=11.0 + conda install -c rapidsai -c conda-forge -c nvidia dask-cuda cudatoolkit=11.5 Pip --- From 5baa89d87f550493f3fdefbd681a360d98560f09 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 3 Jan 2023 13:53:15 +0100 Subject: [PATCH 05/31] Ensure consistent results from `safe_sizeof()` in test (#1071) Probe `__cuda_array_interface__` in `test_device_host_file_step_by_step`, to get consistent results from `safe_sizeof()`. Fixes #1070 Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1071 --- dask_cuda/tests/test_cudf_builtin_spilling.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py index 3e9519ca..c6548e42 100644 --- a/dask_cuda/tests/test_cudf_builtin_spilling.py +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -77,6 +77,11 @@ def test_device_host_file_step_by_step(tmp_path, manager: SpillManager): tmpdir.mkdir() pdf = pandas.DataFrame({"a": [1, 2, 3]}) cdf = cudf.DataFrame({"a": [1, 2, 3]}) + + # Pandas will cache the result of probing this attribute. 
+ # We trigger it here, to get consistent results from `safe_sizeof()` + hasattr(pdf, "__cuda_array_interface__") + dhf = DeviceHostFile( device_memory_limit=safe_sizeof(pdf), memory_limit=safe_sizeof(pdf), From 10b73acec814abb41f150e53c1a22701da5e0561 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 3 Jan 2023 08:26:35 -0500 Subject: [PATCH 06/31] Pass missing argument to groupby benchmark compute (#1069) Authors: - Matthew Farrellee (https://github.com/mattf) - AJ Schmidt (https://github.com/ajschmidt8) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1069 --- dask_cuda/benchmarks/local_cudf_groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index 0a142698..4e9dea94 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -107,6 +107,7 @@ def bench_once(client, args, write_profile=None): t1 = clock() agg = apply_groupby( df, + backend=args.backend, sort=args.sort, split_out=args.split_out, split_every=args.split_every, From d78c60aac4410c305fb462b1bc679889aec41e37 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Jan 2023 09:04:39 -0600 Subject: [PATCH 07/31] Add GitHub Actions Workflows (#1062) This PR adds GitHub Actions workflows to `dask-cuda`. ### Task list Coverage required for this PR: - [x] Python tests - [x] Codecov - [x] Style checks Future work required: - [Deploy sdist/wheels to PyPI](https://github.com/rapidsai/dask-cuda/blob/d6ff68daae638c30e1e2e25f2fb91ecc1ee8f6ea/ci/cpu/build.sh#L98) Authors: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1062 --- .github/CODEOWNERS | 3 +- .github/ops-bot.yaml | 2 +- .github/workflows/build.yaml | 63 ++++++++++++++++++ .github/workflows/pr.yaml | 49 ++++++++++++++ .github/workflows/test.yaml | 24 +++++++ ci/build_python.sh | 17 +++++ ci/build_python_pypi.sh | 18 ++++++ ci/check_style.sh | 18 ++++++ ci/gpu/build.sh | 2 +- ci/release/update-version.sh | 7 +- ci/test_python.sh | 88 +++++++++++++++++++++++++ conda/recipes/dask-cuda/meta.yaml | 7 +- dependencies.yaml | 103 ++++++++++++++++++++++++++++++ 13 files changed, 393 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/build.yaml create mode 100644 .github/workflows/pr.yaml create mode 100644 .github/workflows/test.yaml create mode 100755 ci/build_python.sh create mode 100755 ci/build_python_pypi.sh create mode 100755 ci/check_style.sh create mode 100755 ci/test_python.sh create mode 100644 dependencies.yaml diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 23d0af35..9bfa630e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,8 +2,9 @@ dask_cuda/ @rapidsai/daskcuda-python-codeowners #build/ops code owners -.github/ @rapidsai/ops-codeowners +.github/ @rapidsai/ops-codeowners ci/ @rapidsai/ops-codeowners conda/ @rapidsai/ops-codeowners **/Dockerfile @rapidsai/ops-codeowners **/.dockerignore @rapidsai/ops-codeowners +dependencies.yaml @rapidsai/ops-codeowners diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 5808edbd..2d1444c5 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,5 +5,5 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -external_contributors: false 
copy_prs: true +recently_updated: true diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 00000000..46ba4284 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,63 @@ +name: build + +on: + push: + branches: + - "branch-*" + tags: + - v[0-9][0-9].[0-9][0-9].[0-9][0-9] + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + build_type: + type: string + default: nightly + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + conda-python-build: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + upload-conda: + needs: [conda-python-build] + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@main + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + wheel-build: + runs-on: ubuntu-latest + container: + image: rapidsai/ci:latest + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Build wheel + run: ci/build_python_pypi.sh + - name: Publish distribution 📦 to PyPI + if: inputs.build_type == 'nightly' + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml new file mode 100644 index 00000000..c48c8f7b --- /dev/null +++ b/.github/workflows/pr.yaml @@ -0,0 +1,49 @@ +name: pr + +on: + push: + branches: + - "pull-request/[0-9]+" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + pr-builder: + needs: + - checks + - conda-python-build + - conda-python-tests + - wheel-build + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@main + checks: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + conda-python-build: + needs: checks + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + with: + build_type: pull-request + conda-python-tests: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + with: + build_type: pull-request + wheel-build: + needs: checks + runs-on: ubuntu-latest + container: + image: rapidsai/ci:latest + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Build wheel + run: ci/build_python_pypi.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml new file mode 100644 index 00000000..44dbd99a --- /dev/null +++ b/.github/workflows/test.yaml @@ -0,0 +1,24 @@ +name: test + +on: + workflow_dispatch: + inputs: + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + +jobs: + conda-python-tests: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} diff --git a/ci/build_python.sh b/ci/build_python.sh new file mode 
100755 index 00000000..4124a4c5 --- /dev/null +++ b/ci/build_python.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION. + +set -euo pipefail + +source rapids-env-update + +export CMAKE_GENERATOR=Ninja + +rapids-print-env + +rapids-logger "Begin py build" + +rapids-mamba-retry mambabuild \ + conda/recipes/dask-cuda + +rapids-upload-conda-to-s3 python diff --git a/ci/build_python_pypi.sh b/ci/build_python_pypi.sh new file mode 100755 index 00000000..5fea926c --- /dev/null +++ b/ci/build_python_pypi.sh @@ -0,0 +1,18 @@ +#!/bin/bash + + +python -m pip install build --user + +# While conda provides these during conda-build, they are also necessary during +# the setup.py build for PyPI +export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags) +export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count) + +# Compute/export VERSION_SUFFIX +source rapids-env-update + +python -m build \ + --sdist \ + --wheel \ + --outdir dist/ \ + . diff --git a/ci/check_style.sh b/ci/check_style.sh new file mode 100755 index 00000000..be3ac3f4 --- /dev/null +++ b/ci/check_style.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-logger "Create checks conda environment" +. /opt/conda/etc/profile.d/conda.sh + +rapids-dependency-file-generator \ + --output conda \ + --file_key checks \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n checks +conda activate checks + +# Run pre-commit checks +pre-commit run --hook-stage manual --all-files --show-diff-on-failure diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e71b89e4..b9661f52 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` export UCX_PATH=$CONDA_PREFIX -export UCXPY_VERSION=0.30.* +export UCXPY_VERSION=0.30 unset GIT_DESCRIBE_TAG # Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x, diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index afd907b5..0938bff0 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -22,7 +22,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -NEXT_UCXPY_VERSION="$(curl -s https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" +NEXT_UCXPY_VERSION="$(curl -s https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -33,3 +33,8 @@ function sed_runner() { # Update UCX-Py version sed_runner "s/export UCXPY_VERSION=.*/export UCXPY_VERSION="${NEXT_UCXPY_VERSION}"/g" ci/gpu/build.sh + +# Bump cudf and dask-cudf testing dependencies +sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml +sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml +sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml diff --git a/ci/test_python.sh b/ci/test_python.sh new file mode 100755 index 00000000..25e19cca --- /dev/null +++ b/ci/test_python.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Copyright (c) 2022, NVIDIA CORPORATION. + +set -euo pipefail + +. 
/opt/conda/etc/profile.d/conda.sh + +rapids-logger "Generate Python testing dependencies" +rapids-dependency-file-generator \ + --output conda \ + --file_key test_python \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n test + +# Temporarily allow unbound variables for conda activation. +set +u +conda activate test +set -u + +rapids-logger "Downloading artifacts from previous jobs" +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) + +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" +SUITEERROR=0 + +rapids-print-env + +rapids-mamba-retry install \ + -c "${PYTHON_CHANNEL}" \ + dask-cuda + +rapids-logger "Check GPU usage" +nvidia-smi + +set +e + +rapids-logger "pytest dask-cuda" +pushd dask_cuda +DASK_CUDA_TEST_SINGLE_GPU=1 \ +UCXPY_IFNAME=eth0 \ +UCX_WARN_UNUSED_ENV_VARS=n \ +UCX_MEMTYPE_CACHE=n \ +pytest \ + --capture=no \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \ + --cov-config=../pyproject.toml \ + --cov=dask_cuda \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cuda-coverage.xml" \ + --cov-report=term \ + tests +exitcode=$? + +if (( ${exitcode} != 0 )); then + SUITEERROR=${exitcode} + echo "FAILED: 1 or more tests in dask-cuda" +fi +popd + +rapids-logger "Run local benchmark" +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend dask +exitcode=$? + +if (( ${exitcode} != 0 )); then + SUITEERROR=${exitcode} + echo "FAILED: Local benchmark with dask comms" +fi + +python dask_cuda/benchmarks/local_cudf_shuffle.py \ + --partition-size="1 KiB" \ + -d 0 \ + --runs 1 \ + --backend explicit-comms +exitcode=$? + +if (( ${exitcode} != 0 )); then + SUITEERROR=${exitcode} + echo "FAILED: Local benchmark with explicit comms" +fi + +exit ${SUITEERROR} diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index a31628b2..b0b02cb2 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # Usage: # conda build -c conda-forge . 
@@ -6,7 +6,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set number = environ.get('GIT_DESCRIBE_NUMBER', 0) %} -{% set py_version = environ.get('CONDA_PY', 36) %} +{% set py_version = environ['CONDA_PY'] %} {% set git_hash = environ.get('GIT_DESCRIBE_HASH', '') %} package: @@ -42,9 +42,8 @@ test: imports: - dask_cuda - about: - home: http://rapids.ai/ + home: https://rapids.ai/ license: Apache-2.0 license_file: ../../../LICENSE summary: dask-cuda library diff --git a/dependencies.yaml b/dependencies.yaml new file mode 100644 index 00000000..663fd216 --- /dev/null +++ b/dependencies.yaml @@ -0,0 +1,103 @@ +# Dependency list for https://github.com/rapidsai/dependency-file-generator +files: + all: + output: none + includes: + - build_python + - cudatoolkit + - develop + - py_version + - run_python + - test_python + test_python: + output: none + includes: + - cudatoolkit + - py_version + - test_python + checks: + output: none + includes: + - develop + - py_version +channels: + - rapidsai + - rapidsai-nightly + - dask/label/dev + - conda-forge + - nvidia +dependencies: + build_python: + common: + - output_types: [conda, requirements] + packages: + - setuptools>=64.0.0 + cudatoolkit: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.2" + packages: + - cudatoolkit=11.2 + - matrix: + cuda: "11.4" + packages: + - cudatoolkit=11.4 + - matrix: + cuda: "11.5" + packages: + - cudatoolkit=11.5 + develop: + common: + - output_types: [conda, requirements] + packages: + - pre-commit + py_version: + specific: + - output_types: conda + matrices: + - matrix: + py: "3.8" + packages: + - python=3.8 + - matrix: + py: "3.9" + packages: + - python=3.9 + - matrix: + packages: + - python>=3.8,<3.10 + run_python: + common: + - output_types: [conda, requirements] + packages: + - dask>=2022.12.0 + - distributed>=2022.12.0 + - numba>=0.54 + - numpy>=1.18.0 + - pandas>=1.0 + - pynvml>=11.0.0 + - zict>=0.1.3 + test_python: + common: + - output_types: [conda] + packages: + - cucim + - cudf=23.02 + - dask-cudf=23.02 + - pytest + - pytest-cov + - ucx-proc=*=gpu + - ucx-py=0.30 + specific: + - output_types: conda + matrices: + - matrix: + arch: x86_64 + packages: + - numactl-devel-cos7-x86_64 + - matrix: + arch: aarch64 + packages: + - numactl-devel-cos7-aarch64 From b345d9c830ec38e7a682d6a271a39b582e1e308d Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 4 Jan 2023 15:44:06 -0500 Subject: [PATCH 08/31] Update builds for CUDA `11.8` and Python `310` (#1072) This PR updates the `dask-cuda` CI workflows to build against the CUDA `11.8` / Python `3.10` [branch](https://github.com/rapidsai/shared-action-workflows/tree/cuda-118) of the `shared-action-workflows` repository. 
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1072 --- .github/workflows/build.yaml | 4 ++-- .github/workflows/pr.yaml | 9 ++++++--- .github/workflows/test.yaml | 2 +- dependencies.yaml | 10 +++++++++- pyproject.toml | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 46ba4284..6376d33c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-118 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c48c8f7b..3ba8410f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -17,19 +17,22 @@ jobs: - conda-python-tests - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-118 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-matrix-build.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit + # TODO: Switch this testing branch to "cuda-118" after `cudf` `3.10` builds are out. + # There is a circular testing dependency between `dask-cuda` and `cudf` right now, which + # prevents us from running `3.10` tests for `dask-cuda` until `3.10` `cudf` packages are published. 
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 44dbd99a..33d6c020 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/dependencies.yaml b/dependencies.yaml index 663fd216..c7964722 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -48,6 +48,10 @@ dependencies: cuda: "11.5" packages: - cudatoolkit=11.5 + - matrix: + cuda: "11.8" + packages: + - cudatoolkit=11.8 develop: common: - output_types: [conda, requirements] @@ -65,9 +69,13 @@ dependencies: py: "3.9" packages: - python=3.9 + - matrix: + py: "3.10" + packages: + - python=3.10 - matrix: packages: - - python>=3.8,<3.10 + - python>=3.8,<3.11 run_python: common: - output_types: [conda, requirements] diff --git a/pyproject.toml b/pyproject.toml index beb3aa1b..7a88741e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ] [project.scripts] From 74b4557df64fbf42461060b4bb536a6b5249202e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 6 Jan 2023 14:50:21 +0000 Subject: [PATCH 09/31] Fix owner check when the owner is a cupy array (#1061) A cupy array can't be used in a boolean setting (it is neither truthy nor falsy because at heart it's intuitionist) so we need to explicitly check that the owner is None. Authors: - Lawrence Mitchell (https://github.com/wence-) - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1061 --- dask_cuda/get_device_memory_objects.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/get_device_memory_objects.py b/dask_cuda/get_device_memory_objects.py index 44dc433f..c5746c86 100644 --- a/dask_cuda/get_device_memory_objects.py +++ b/dask_cuda/get_device_memory_objects.py @@ -51,8 +51,8 @@ def get_device_memory_objects_default(obj): return dispatch(obj._pxy_get().obj) if hasattr(obj, "data"): return dispatch(obj.data) - owner = getattr(obj, "owner", None) or getattr(obj, "_owner", None) - if owner: + owner = getattr(obj, "owner", getattr(obj, "_owner", None)) + if owner is not None: return dispatch(owner) if hasattr(obj, "__cuda_array_interface__"): return [obj] From bdb7b565e92eb79080eadd82482bdac9d1ca0c64 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 10 Jan 2023 15:00:01 +0100 Subject: [PATCH 10/31] Improve shuffle-benchmark (#1074) Adding `--ignore-index` and balance the partition distribution between workers. This should make the runs more consist and improve the data creation significantly. Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1074 --- dask_cuda/benchmarks/common.py | 11 ++- dask_cuda/benchmarks/local_cudf_shuffle.py | 92 ++++++++++++++++------ dask_cuda/explicit_comms/comms.py | 2 +- 3 files changed, 79 insertions(+), 26 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 7c489d00..00aa31dc 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -85,7 +85,8 @@ class Config(NamedTuple): def run_benchmark(client: Client, args: Namespace, config: Config): """Run a benchmark a specified number of times - If ``args.profile`` is set, the final run is profiled.""" + If ``args.profile`` is set, the final run is profiled. + """ results = [] for _ in range(max(1, args.runs) - 1): res = config.bench_once(client, args, write_profile=None) @@ -110,8 +111,11 @@ def gather_bench_results(client: Client, args: Namespace, config: Config): def run(client: Client, args: Namespace, config: Config): """Run the full benchmark on the cluster - Waits for the cluster, sets up memory pools, prints and saves results""" + Waits for the cluster, sets up memory pools, prints and saves results + """ + wait_for_cluster(client, shutdown_on_failure=True) + assert len(client.scheduler_info()["workers"]) > 0 setup_memory_pools( client, args.type == "gpu", @@ -156,7 +160,8 @@ def run_client_from_existing_scheduler(args: Namespace, config: Config): def run_create_client(args: Namespace, config: Config): """Create a client + cluster and run - Shuts down the cluster at the end of the benchmark""" + Shuts down the cluster at the end of the benchmark + """ cluster_options = get_cluster_options(args) Cluster = cluster_options["class"] cluster_args = cluster_options["args"] diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 7ff099cc..d9039aad 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -1,13 +1,16 @@ import contextlib from collections import ChainMap from time import perf_counter +from typing import Tuple +import numpy as np import pandas as pd import dask -from dask import array as da +import dask.dataframe +from dask.dataframe.core import new_dd_object from dask.dataframe.shuffle import shuffle -from dask.distributed import performance_report, wait +from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, parse_bytes import dask_cuda.explicit_comms.dataframe.shuffle @@ -20,42 +23,82 @@ print_throughput_bandwidth, ) +try: + import cupy -def shuffle_dask(df, *, noop=False): - result = shuffle(df, index="data", shuffle="tasks") - if noop: + import cudf +except ImportError: + cupy = None + cudf = None + + +def shuffle_dask(df, args): + result = shuffle(df, index="data", shuffle="tasks", ignore_index=args.ignore_index) + if args.backend == "dask-noop": result = as_noop(result) t1 = perf_counter() wait(result.persist()) return perf_counter() - t1 -def shuffle_explicit_comms(df): +def shuffle_explicit_comms(df, args): t1 = perf_counter() wait( dask_cuda.explicit_comms.dataframe.shuffle.shuffle( - df, column_names="data" + df, column_names="data", ignore_index=args.ignore_index ).persist() ) return perf_counter() - t1 -def bench_once(client, args, write_profile=None): - # Generate random Dask dataframe - chunksize = 
args.partition_size // 8 # Convert bytes to float64 - nchunks = args.in_parts - totalsize = chunksize * nchunks - x = da.random.random((totalsize,), chunks=(chunksize,)) - df = dask.dataframe.from_dask_array(x, columns="data").to_frame() +def create_df(nelem, df_type): + if df_type == "cpu": + return pd.DataFrame({"data": np.random.random(nelem)}) + elif df_type == "gpu": + if cudf is None or cupy is None: + raise RuntimeError("`--type=gpu` requires cudf and cupy ") + return cudf.DataFrame({"data": cupy.random.random(nelem)}) + else: + raise ValueError(f"Unknown type {df_type}") + + +def create_data( + client: Client, args, name="balanced-df" +) -> Tuple[int, dask.dataframe.DataFrame]: + """Create an evenly distributed dask dataframe + + The partitions are perfectly distributed across workers, if the number of + requested partitions is evenly divisible by the number of workers. + """ + + workers = list(client.scheduler_info()["workers"].keys()) + assert len(workers) > 0 + + chunksize = args.partition_size // np.float64().nbytes + # Distribute the new partitions between workers by round robin. + # We use `client.submit` to control the distribution exactly. + # TODO: support unbalanced partition distribution + dsk = {} + for i in range(args.in_parts): + worker = workers[i % len(workers)] # Round robin + dsk[(name, i)] = client.submit( + create_df, chunksize, args.type, workers=[worker], pure=False + ) + wait(dsk.values()) - if args.type == "gpu": - import cudf + df_meta = create_df(0, args.type) + divs = [None] * (len(dsk) + 1) + ret = new_dd_object(dsk, name, df_meta, divs).persist() + wait(ret) - df = df.map_partitions(cudf.from_pandas) + data_processed = args.in_parts * args.partition_size + if not args.ignore_index: + data_processed += args.in_parts * chunksize * df_meta.index.dtype.itemsize + return data_processed, ret - df = df.persist() - wait(df) - data_processed = len(df) * sum([t.itemsize for t in df.dtypes]) + +def bench_once(client, args, write_profile=None): + data_processed, df = create_data(client, args) if write_profile is None: ctx = contextlib.nullcontext() @@ -64,9 +107,9 @@ def bench_once(client, args, write_profile=None): with ctx: if args.backend in {"dask", "dask-noop"}: - duration = shuffle_dask(df, noop=args.backend == "dask-noop") + duration = shuffle_dask(df, args) else: - duration = shuffle_explicit_comms(df) + duration = shuffle_explicit_comms(df, args) return (data_processed, duration) @@ -177,6 +220,11 @@ def parse_args(): "type": int, "help": "Number of runs", }, + { + "name": "--ignore-index", + "action": "store_true", + "help": "When shuffle, ignore the index", + }, ] return parse_benchmark_args( diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index 0ebd7f0c..05dbc961 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -180,7 +180,7 @@ def __init__(self, client: Optional[Client] = None): self.sessionId = uuid.uuid4().int # Get address of all workers (not Nanny addresses) - self.worker_addresses = list(self.client.run(lambda: 42).keys()) + self.worker_addresses = list(self.client.scheduler_info()["workers"].keys()) # Make all workers listen and get all listen addresses self.worker_direct_addresses = [] From 0957418497d22e595d838a611d87709a10e2879d Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 11 Jan 2023 20:53:23 +0100 Subject: [PATCH 11/31] Use TrackingResourceAdaptor to get better debug info (#1079) For better out of memory message, JIT-unspill now check the current RMM resource stack for resources such as `StatisticsResourceAdaptor` and `TrackingResourceAdaptor` that can report the current allocated bytes. Enable by running `dask-cuda-worker` with `--rmm-track-allocations=True` or calling `dask_cuda.LocalCUDACluster` with `rmm_track_allocations=True`. This is very useful for debugging RMM fragmentation. Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1079 --- dask_cuda/benchmarks/common.py | 1 + dask_cuda/benchmarks/utils.py | 18 ++++++++++- dask_cuda/local_cuda_cluster.py | 2 +- dask_cuda/proxify_host_file.py | 15 ++++++--- dask_cuda/tests/test_proxify_host_file.py | 29 +++++++++++------- dask_cuda/utils.py | 37 +++++++++++++++++++++-- 6 files changed, 82 insertions(+), 20 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 00aa31dc..e734f882 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -122,6 +122,7 @@ def run(client: Client, args: Namespace, config: Config): args.rmm_pool_size, args.disable_rmm_pool, args.rmm_log_directory, + args.enable_rmm_statistics, ) address_to_index, results, message_data = gather_bench_results(client, args, config) p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 8a8419cd..28d43cc1 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -105,6 +105,13 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] help="Directory to write worker and scheduler RMM log files to. " "Logging is only enabled if RMM memory pool is enabled.", ) + cluster_args.add_argument( + "--enable-rmm-statistics", + action="store_true", + help="Use RMM's StatisticsResourceAdaptor to gather allocation statistics. " + "This enables spilling implementations such as JIT-Unspill to provides more " + "information on out-of-memory errors", + ) cluster_args.add_argument( "--enable-tcp-over-ucx", default=None, @@ -340,6 +347,7 @@ def setup_memory_pool( pool_size=None, disable_pool=False, log_directory=None, + statistics=False, ): import cupy @@ -358,9 +366,15 @@ def setup_memory_pool( log_file_name=get_rmm_log_file_name(dask_worker, logging, log_directory), ) cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) + if statistics: + rmm.mr.set_current_device_resource( + rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource()) + ) -def setup_memory_pools(client, is_gpu, pool_size, disable_pool, log_directory): +def setup_memory_pools( + client, is_gpu, pool_size, disable_pool, log_directory, statistics +): if not is_gpu: return client.run( @@ -368,6 +382,7 @@ def setup_memory_pools(client, is_gpu, pool_size, disable_pool, log_directory): pool_size=pool_size, disable_pool=disable_pool, log_directory=log_directory, + statistics=statistics, ) # Create an RMM pool on the scheduler due to occasional deserialization # of CUDA objects. May cause issues with InfiniBand otherwise. 
@@ -376,6 +391,7 @@ def setup_memory_pools(client, is_gpu, pool_size, disable_pool, log_directory): pool_size=1e9, disable_pool=disable_pool, log_directory=log_directory, + statistics=statistics, ) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 115c419c..fa532b5f 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -124,7 +124,7 @@ class LocalCUDACluster(LocalCluster): Managed memory is currently incompatible with NVLink. Trying to enable both will result in an exception. rmm_async: bool, default False - Initialize each worker withh RMM and set it to use RMM's asynchronous allocator. + Initialize each worker with RMM and set it to use RMM's asynchronous allocator. See ``rmm.mr.CudaAsyncMemoryResource`` for more info. .. warning:: diff --git a/dask_cuda/proxify_host_file.py b/dask_cuda/proxify_host_file.py index f258776e..47bb3952 100644 --- a/dask_cuda/proxify_host_file.py +++ b/dask_cuda/proxify_host_file.py @@ -43,6 +43,7 @@ from .is_spillable_object import cudf_spilling_status from .proxify_device_objects import proxify_device_objects, unproxify_device_objects from .proxy_object import ProxyObject +from .utils import get_rmm_device_memory_usage T = TypeVar("T") @@ -591,12 +592,16 @@ def oom(nbytes: int) -> bool: traceback.print_stack(file=f) f.seek(0) tb = f.read() + + dev_mem = get_rmm_device_memory_usage() + dev_msg = "" + if dev_mem is not None: + dev_msg = f"RMM allocs: {format_bytes(dev_mem)}, " + self.logger.warning( - "RMM allocation of %s failed, spill-on-demand couldn't " - "find any device memory to spill:\n%s\ntraceback:\n%s\n", - format_bytes(nbytes), - self.manager.pprint(), - tb, + f"RMM allocation of {format_bytes(nbytes)} failed, " + "spill-on-demand couldn't find any device memory to " + f"spill.\n{dev_msg}{self.manager}, traceback:\n{tb}\n" ) # Since we didn't find anything to spill, we give up. 
return False diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 09b5c9b4..1babaa2c 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -1,4 +1,3 @@ -import re from typing import Iterable from unittest.mock import patch @@ -10,6 +9,7 @@ import dask.dataframe from dask.dataframe.shuffle import shuffle_group from dask.sizeof import sizeof +from dask.utils import format_bytes from distributed import Client from distributed.utils_test import gen_test from distributed.worker import get_worker @@ -448,25 +448,32 @@ def test_on_demand_debug_info(): if not hasattr(rmm.mr, "FailureCallbackResourceAdaptor"): pytest.skip("RMM doesn't implement FailureCallbackResourceAdaptor") - total_mem = get_device_total_memory() + rmm_pool_size = 2**20 def task(): - rmm.DeviceBuffer(size=total_mem + 1) + ( + rmm.DeviceBuffer(size=rmm_pool_size // 2), + rmm.DeviceBuffer(size=rmm_pool_size // 2), + rmm.DeviceBuffer(size=rmm_pool_size), # Trigger OOM + ) - with dask_cuda.LocalCUDACluster(n_workers=1, jit_unspill=True) as cluster: + with dask_cuda.LocalCUDACluster( + n_workers=1, + jit_unspill=True, + rmm_pool_size=rmm_pool_size, + rmm_maximum_pool_size=rmm_pool_size, + rmm_track_allocations=True, + ) as cluster: with Client(cluster) as client: # Warmup, which trigger the initialization of spill on demand client.submit(range, 10).result() # Submit too large RMM buffer - with pytest.raises( - MemoryError, match=r".*std::bad_alloc:.*CUDA error at:.*" - ): + with pytest.raises(MemoryError, match="Maximum pool size exceeded"): client.submit(task).result() log = str(client.get_worker_logs()) - assert re.search( - "WARNING - RMM allocation of .* failed, spill-on-demand", log - ) - assert re.search(": Empty", log) + size = format_bytes(rmm_pool_size) + assert f"WARNING - RMM allocation of {size} failed" in log + assert f"RMM allocs: {size}" in log assert "traceback:" in log diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index a60c05e7..850006ea 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -7,6 +7,7 @@ from contextlib import suppress from functools import singledispatch from multiprocessing import cpu_count +from typing import Optional import numpy as np import pynvml @@ -19,8 +20,6 @@ from distributed import Worker, wait from distributed.comm import parse_address -from .proxify_host_file import ProxifyHostFile - try: from nvtx import annotate as nvtx_annotate except ImportError: @@ -681,6 +680,8 @@ def get_gpu_uuid_from_index(device_index=0): def get_worker_config(dask_worker): + from .proxify_host_file import ProxifyHostFile + # assume homogenous cluster plugin_vals = dask_worker.plugins.values() ret = {} @@ -822,3 +823,35 @@ def get_cluster_configuration(client): _get_cluster_configuration, client=client, asynchronous=client.asynchronous ) return data + + +def get_rmm_device_memory_usage() -> Optional[int]: + """Get current bytes allocated on current device through RMM + + Check the current RMM resource stack for resources such as + `StatisticsResourceAdaptor` and `TrackingResourceAdaptor` + that can report the current allocated bytes. Returns None, + if no such resources exist. 
+ + Return + ------ + nbytes: int or None + Number of bytes allocated on device through RMM or None + """ + + def get_rmm_memory_resource_stack(mr) -> list: + if hasattr(mr, "upstream_mr"): + return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr) + return [mr] + + try: + import rmm + except ImportError: + return None + + for mr in get_rmm_memory_resource_stack(rmm.mr.get_current_device_resource()): + if isinstance(mr, rmm.mr.TrackingResourceAdaptor): + return mr.get_allocated_bytes() + if isinstance(mr, rmm.mr.StatisticsResourceAdaptor): + return mr.allocation_counts["current_bytes"] + return None From b42151d8bfe9c28be46d1ace7e0e2be26a4de06d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 12 Jan 2023 14:11:54 +0100 Subject: [PATCH 12/31] Shuffle by partition to reduce memory usage significantly (#1068) In order to reduce peak memory usage, this PR implements _rounds_ in explicit-comms shuffle. The idea is that each worker handles a number of dataframe partitions in each round instead of doing everything at once. The number of partitions handled in each round can be controlled by setting `DASK_EXPLICIT_COMMS_BATCHSIZE` or directly when calling `shuffle()`. By default, each worker handles one partition per round. Set `DASK_EXPLICIT_COMMS_BATCHSIZE=-1`, to handle all partitions in a single round (the previous behavior). Authors: - Mads R. B. Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1068 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 335 +++++++++++++----- dask_cuda/tests/test_explicit_comms.py | 51 +-- 2 files changed, 275 insertions(+), 111 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 6099025d..c6e07006 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -4,11 +4,10 @@ import functools import inspect from collections import defaultdict +from math import ceil from operator import getitem from typing import Any, Callable, Dict, List, Optional, Set, TypeVar -import numpy - import dask import dask.dataframe from dask.base import tokenize @@ -23,9 +22,68 @@ T = TypeVar("T") +Proxify = Callable[[T], T] + + +def get_proxify(worker: Worker) -> Proxify: + """Get function to proxify objects""" + from dask_cuda.proxify_host_file import ProxifyHostFile + + if isinstance(worker.data, ProxifyHostFile): + data = worker.data + return lambda x: data.manager.proxify(x)[0] + return lambda x: x # no-op + + +def get_no_comm_postprocess( + stage: Dict[str, Any], num_rounds: int, batchsize: int +) -> Callable[[DataFrame], DataFrame]: + """Get function for post-processing partitions not communicated + + In cuDF, the `group_split_dispatch` uses `scatter_by_map` to create + the partitions, which is implemented by splitting a single base dataframe + into multiple partitions. This means that memory are not freed until + ALL partitions are deleted. + + In order to free memory ASAP, we can deep copy partitions NOT being + communicated. We do this when `num_rounds != batchsize`. + + Parameters + ---------- + stage + The staged input dataframes. + num_rounds: int + Number of rounds of dataframe partitioning and all-to-all communication. + batchsize: int + Number of partitions each worker will handle in each round. 
+ + Returns + ------- + Function to be called on partitions not communicated. + + """ + if num_rounds == batchsize: + return lambda x: x + + # Check that we are shuffling a cudf dataframe + try: + import cudf + except ImportError: + return lambda x: x + if not stage or not isinstance(next(iter(stage.values())), cudf.DataFrame): + return lambda x: x + + # Deep copying a cuDF dataframe doesn't deep copy its index hence + # we have to do it explicitly. + return lambda x: x._from_data( + x._data.copy(deep=True), + x._index.copy(deep=True), + ) + + async def send( eps, - myrank, + myrank: int, rank_to_out_part_ids: Dict[int, Set[int]], out_part_id_to_dataframe: Dict[int, DataFrame], ) -> None: @@ -43,10 +101,10 @@ async def send( async def recv( eps, - myrank, + myrank: int, rank_to_out_part_ids: Dict[int, Set[int]], out_part_id_to_dataframe_list: Dict[int, List[DataFrame]], - proxify, + proxify: Proxify, ) -> None: """Notice, received items are appended to `out_parts_list`""" @@ -60,17 +118,9 @@ async def read_msg(rank: int) -> None: ) -def get_proxify(worker: Worker) -> Callable[[T], T]: - """Get function to proxify objects""" - from dask_cuda.proxify_host_file import ProxifyHostFile - - if isinstance(worker.data, ProxifyHostFile): - data = worker.data - return lambda x: data.manager.proxify(x)[0] - return lambda x: x # no-op - - -def compute_map_index(df: Any, column_names, npartitions) -> Series: +def compute_map_index( + df: DataFrame, column_names: List[str], npartitions: int +) -> Series: """Return a Series that maps each row `df` to a partition ID The partitions are determined by hashing the columns given by column_names @@ -79,17 +129,17 @@ def compute_map_index(df: Any, column_names, npartitions) -> Series: Parameters ---------- - df: DataFrame - column_names: list of strings + df + The dataframe. + column_names List of column names on which we want to split. - npartitions: int or None + npartitions The desired number of output partitions. Returns ------- - out: Dict[int, DataFrame] - A dictionary mapping integers in {0..k} to dataframes such that the - hash values of `df[col]` are well partitioned. + Series + Series that maps each row `df` to a partition ID """ if column_names[0] == "_partitions": @@ -98,61 +148,82 @@ def compute_map_index(df: Any, column_names, npartitions) -> Series: ind = hash_object_dispatch( df[column_names] if column_names else df, index=False ) - typ = numpy.min_scalar_type(npartitions * 2) - return (ind % npartitions).astype(typ, copy=False) + return ind % npartitions -def single_shuffle_group( - df: DataFrame, column_names, npartitions, ignore_index +def partition_dataframe( + df: DataFrame, column_names: List[str], npartitions: int, ignore_index: bool ) -> Dict[int, DataFrame]: - """Split dataframe based on the indexes returned by `compute_map_index`""" + """Partition dataframe to a dict of dataframes + + The partitions are determined by hashing the columns given by column_names + unless `column_names[0] == "_partitions"`, in which case the values of + `column_names[0]` are used as index. + + Parameters + ---------- + df + The dataframe to partition + column_names + List of column names on which we want to partition. + npartitions + The desired number of output partitions. + ignore_index + Ignore index during shuffle. If True, performance may improve, + but index values will not be preserved. 
+ + Returns + ------- + partitions: list of DataFrames + List of dataframe-partitions + """ + # TODO: use cuDF's partition_by_hash() when `column_names[0] != "_partitions"` map_index = compute_map_index(df, column_names, npartitions) return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) -def multi_shuffle_group( - df_meta: DataFrame, - dfs: Dict[str, DataFrame], - column_names, - npartitions, - ignore_index, - proxify, +def create_partitions( + stage: Dict[str, Any], + batchsize: int, + column_names: List[str], + npartitions: int, + ignore_index: bool, + proxify: Proxify, ) -> Dict[int, DataFrame]: - """Split multiple dataframes such that each partition hashes to the same - - Since we concatenate dataframes belonging to the same partition, each - partition ID maps to exactly one dataframe. + """Create partitions from one or more staged dataframes Parameters ---------- - df_meta: DataFrame - An empty dataframe matching the expected output - dfs: dict of dataframes - The dataframes to split given as a map of stage keys to dataframes - column_names: list of strings + stage + The staged input dataframes + column_names List of column names on which we want to split. - npartitions: int or None + npartitions The desired number of output partitions. - ignore_index: bool + ignore_index Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. - proxify: callable + proxify Function to proxify object. Returns ------- - dict of DataFrames - Mapping from partition ID to dataframe. + partitions: list of DataFrames + List of dataframe-partitions """ + if not stage: + return {} + batchsize = min(len(stage), batchsize) + # Grouping each input dataframe, one part for each partition ID. dfs_grouped: List[Dict[int, DataFrame]] = [] - while dfs: + for _ in range(batchsize): dfs_grouped.append( proxify( - single_shuffle_group( + partition_dataframe( # pop dataframe in any order, to free staged memory ASAP - dfs.popitem()[1], + stage.popitem()[1], column_names, npartitions, ignore_index, @@ -165,24 +236,82 @@ def multi_shuffle_group( ret: Dict[int, DataFrame] = {} for i in range(npartitions): # Iterate over all possible output partition IDs t = [df_grouped[i] for df_grouped in dfs_grouped] + assert len(t) > 0 if len(t) == 1: - ret[i] = t[0] + ret[i] = proxify(t[0]) elif len(t) > 1: ret[i] = proxify(dd_concat(t, ignore_index=ignore_index)) - else: - ret[i] = df_meta # Empty dataframe return ret +async def send_recv_partitions( + eps: dict, + myrank: int, + rank_to_out_part_ids: Dict[int, Set[int]], + out_part_id_to_dataframe: Dict[int, DataFrame], + no_comm_postprocess: Callable[[DataFrame], DataFrame], + proxify: Proxify, + out_part_id_to_dataframe_list: Dict[int, List[DataFrame]], +) -> None: + """Send and receive (all-to-all) partitions between all workers + + Parameters + ---------- + eps + Communication endpoints to the other workers. + myrank + The rank of this worker. + rank_to_out_part_ids + dict that for each worker rank specifices a set of output partition IDs. + If the worker shouldn't return any partitions, it is excluded from the + dict. Partition IDs are global integers `0..npartitions` and corresponds + to the dict keys returned by `group_split_dispatch`. + out_part_id_to_dataframe + Mapping from partition ID to dataframe. This dict is cleared on return. + no_comm_postprocess + Function to post-process partitions not communicated. + See `get_no_comm_postprocess` + proxify + Function to proxify object. 
+ out_part_id_to_dataframe_list + The **output** of this function, which is a dict of the partitions owned by + this worker. + """ + await asyncio.gather( + recv( + eps, + myrank, + rank_to_out_part_ids, + out_part_id_to_dataframe_list, + proxify, + ), + send(eps, myrank, rank_to_out_part_ids, out_part_id_to_dataframe), + ) + + # At this point `send()` should have pop'ed all output partitions + # beside the partitions owned be `myrank` (if any). + assert ( + rank_to_out_part_ids[myrank] == out_part_id_to_dataframe.keys() + or not out_part_id_to_dataframe + ) + # We can now add them to the output dataframes. + for out_part_id, dataframe in out_part_id_to_dataframe.items(): + out_part_id_to_dataframe_list[out_part_id].append( + no_comm_postprocess(proxify(dataframe)) + ) + out_part_id_to_dataframe.clear() + + async def shuffle_task( s, - stage_name, - df_meta, + stage_name: str, rank_to_inkeys: Dict[int, set], rank_to_out_part_ids: Dict[int, Set[int]], - column_names, - npartitions, - ignore_index, + column_names: List[str], + npartitions: int, + ignore_index: bool, + num_rounds: int, + batchsize: int, ) -> List[DataFrame]: """Explicit-comms shuffle task @@ -203,11 +332,15 @@ async def shuffle_task( to the dict keys returned by `group_split_dispatch`. column_names: list of strings List of column names on which we want to split. - npartitions: int or None + npartitions: int The desired number of output partitions. ignore_index: bool Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. + num_rounds: int + Number of rounds of dataframe partitioning and all-to-all communication. + batchsize: int + Number of partitions each worker will handle in each round. Returns ------- @@ -216,42 +349,42 @@ async def shuffle_task( """ proxify = get_proxify(s["worker"]) - myrank = s["rank"] eps = s["eps"] + myrank: int = s["rank"] stage = comms.pop_staging_area(s, stage_name) assert stage.keys() == rank_to_inkeys[myrank] + no_comm_postprocess = get_no_comm_postprocess(stage, num_rounds, batchsize) - out_part_id_to_dataframe = multi_shuffle_group( - df_meta=df_meta, - dfs=stage, - column_names=column_names, - npartitions=npartitions, - ignore_index=ignore_index, - proxify=proxify, - ) - - # Communicate all the dataframe-partitions all-to-all. The result is - # `out_part_id_to_dataframe_list` that for each output partition maps - # a list of dataframes received. out_part_id_to_dataframe_list: Dict[int, List[DataFrame]] = defaultdict(list) - await asyncio.gather( - recv(eps, myrank, rank_to_out_part_ids, out_part_id_to_dataframe_list, proxify), - send(eps, myrank, rank_to_out_part_ids, out_part_id_to_dataframe), - ) - - # At this point `send()` should have pop'ed all output partitions - # beside the partitions owned be `myrank`. - assert rank_to_out_part_ids[myrank] == out_part_id_to_dataframe.keys() - # We can now add them to the output dataframes. 
- for out_part_id, dataframe in out_part_id_to_dataframe.items(): - out_part_id_to_dataframe_list[out_part_id].append(dataframe) - del out_part_id_to_dataframe + for _ in range(num_rounds): + partitions = create_partitions( + stage, batchsize, column_names, npartitions, ignore_index, proxify + ) + await send_recv_partitions( + eps, + myrank, + rank_to_out_part_ids, + partitions, + no_comm_postprocess, + proxify, + out_part_id_to_dataframe_list, + ) # Finally, we concatenate the output dataframes into the final output partitions - return [ - proxify(dd_concat(dfs, ignore_index=ignore_index)) - for dfs in out_part_id_to_dataframe_list.values() - ] + ret = [] + while out_part_id_to_dataframe_list: + ret.append( + proxify( + dd_concat( + out_part_id_to_dataframe_list.popitem()[1], + ignore_index=ignore_index, + ) + ) + ) + # For robustness, we yield this task to give Dask a chance to do bookkeeping + # such as letting the Worker answer heartbeat requests + await asyncio.sleep(0) + return ret def shuffle( @@ -259,6 +392,7 @@ def shuffle( column_names: List[str], npartitions: Optional[int] = None, ignore_index: bool = False, + batchsize: Optional[int] = None, ) -> DataFrame: """Order divisions of DataFrame so that all values within column(s) align @@ -283,6 +417,15 @@ def shuffle( ignore_index: bool Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. + batchsize: int + A shuffle consist of multiple rounds where each worker partitions and + then all-to-all communicates a number of its dataframe partitions. The batch + size is the number of partitions each worker will handle in each round. + If -1, each worker will handle all its partitions in a single round and + all techniques to reduce memory usage are disabled, which might be faster + when memory pressure isn't an issue. + If None, the value of `DASK_EXPLICIT_COMMS_BATCHSIZE` is used or 1 if not + set thus by default, we prioritize robustness over performance. Returns ------- @@ -324,6 +467,15 @@ def shuffle( rank_to_inkeys = c.stage_keys(name=name, keys=df.__dask_keys__()) c.client.cancel(df) + # Get batchsize + max_num_inkeys = max(len(k) for k in rank_to_inkeys.values()) + batchsize = batchsize or dask.config.get("explicit_comms-batchsize", 1) + if batchsize == -1: + batchsize = max_num_inkeys + + # Get number of rounds of dataframe partitioning and all-to-all communication. 
+ num_rounds = ceil(max_num_inkeys / batchsize) + # Find the output partition IDs for each worker div = npartitions // len(ranks) rank_to_out_part_ids: Dict[int, Set[int]] = {} # rank -> set of partition id @@ -332,19 +484,20 @@ def shuffle( for rank, i in zip(ranks, range(div * len(ranks), npartitions)): rank_to_out_part_ids[rank].add(i) - # Run `_shuffle()` on each worker + # Run a shuffle task on each worker shuffle_result = {} for rank in ranks: shuffle_result[rank] = c.submit( c.worker_addresses[rank], shuffle_task, name, - df_meta, rank_to_inkeys, rank_to_out_part_ids, column_names, npartitions, ignore_index, + num_rounds, + batchsize, ) wait(list(shuffle_result.values())) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index dd92e2a6..88e1294c 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -74,10 +74,14 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): expected = df1.merge(df2).set_index("key") ddf1 = dd.from_pandas(df1, npartitions=npartitions) ddf2 = dd.from_pandas(df2, npartitions=npartitions) - with dask.config.set(explicit_comms=True): - ddf3 = ddf1.merge(ddf2, on=["key"]).set_index("key") - got = ddf3.compute() - pd.testing.assert_frame_equal(got, expected) + + for batchsize in (-1, 1, 2): + with dask.config.set( + explicit_comms=True, explicit_comms_batchsize=batchsize + ): + ddf3 = ddf1.merge(ddf2, on=["key"]).set_index("key") + got = ddf3.compute() + pd.testing.assert_frame_equal(got, expected) def test_dataframe_merge_empty_partitions(): @@ -130,22 +134,29 @@ def _test_dataframe_shuffle(backend, protocol, n_workers): ddf = dd.from_pandas(df.copy(), npartitions=input_nparts).persist( workers=all_workers ) - ddf = explicit_comms_shuffle( - ddf, ["key"], npartitions=output_nparts - ).persist() - - assert ddf.npartitions == output_nparts - - # Check that each partition of `ddf` hashes to the same value - result = ddf.map_partitions( - check_partitions, output_nparts - ).compute() - assert all(result.to_list()) - - # Check the values of `ddf` (ignoring the row order) - expected = df.sort_values("key") - got = ddf.compute().sort_values("key") - assert_eq(got, expected) + # To reduce test runtime, we change the batchsizes here instead + # of using a test parameter. + for batchsize in (-1, 1, 2): + with dask.config.set(explicit_comms_batchsize=batchsize): + ddf = explicit_comms_shuffle( + ddf, + ["key"], + npartitions=output_nparts, + batchsize=batchsize, + ).persist() + + assert ddf.npartitions == output_nparts + + # Check that each partition hashes to the same value + result = ddf.map_partitions( + check_partitions, output_nparts + ).compute() + assert all(result.to_list()) + + # Check the values (ignoring the row order) + expected = df.sort_values("key") + got = ddf.compute().sort_values("key") + assert_eq(got, expected) @pytest.mark.parametrize("nworkers", [1, 2, 3]) From 1149257bba62ee4ffd3a7df8da47aecf327726bc Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 13 Jan 2023 17:19:15 -0500 Subject: [PATCH 13/31] Add timeout to `pytest` command (#1082) There were two instances recently (below) where some Python test errors caused the `conda-python-tests` job to run/hang for ~4 hours. 
- https://github.com/rapidsai/dask-cuda/pull/981#issuecomment-1382289752 - https://github.com/rapidsai/dask-cuda/pull/1081#issuecomment-1382288016 To prevent this from happening again in the future, I've added a reasonable timeout of ~~45 minutes to that particular job~~ 30 minutes to the `pytest` command. The job usually takes ~25 minutes to complete entirely, so 30 minutes just for `pytest` should be plenty. This timeout will help prevent jobs from hanging and thus help preserve our finite GPU capacity for CI (particularly for `arm` nodes). Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Jake Awe (https://github.com/AyodeAwe) --- ci/test_python.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/test_python.sh b/ci/test_python.sh index 25e19cca..bf221f49 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -43,7 +43,8 @@ DASK_CUDA_TEST_SINGLE_GPU=1 \ UCXPY_IFNAME=eth0 \ UCX_WARN_UNUSED_ENV_VARS=n \ UCX_MEMTYPE_CACHE=n \ -pytest \ +timeout 30m pytest \ + -vv \ --capture=no \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cuda.xml" \ From 2eee5ebfff3289aa10688630ca4b8d51a3f4f794 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 16 Jan 2023 17:13:24 +0100 Subject: [PATCH 14/31] Make proxy tests with `LocalCUDACluster` asynchronous (#1084) After https://github.com/dask/distributed/pull/7429 was merged, some of those tests started hanging and I could confirm there were two threads concurrently attempting to take the UCX spinlock and the GIL, which led to such deadlock. UCX-Py is currently not thread-safe, and indeed can cause problems like this should two or more threads attempt to call communication routines that will required the UCX spinlock. My theory is that the synchronous cluster will indeed cause communication on the main thread (in this case, the `pytest` thread) upon attempting to shutdown the cluster, instead of only within the Distributed communication thread, likely being the reason behind the test hanging. Asynchronous Distributed clusters seem not to cause any communication from the main thread, but only in the communication thread as expected, thus making the tests asynchronous suffice to resolve such issues. In practice, it's unlikely that people will use sync Distributed clusters from the same process (as pytest does), and thus it's improbable to happen in real use-cases. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1084 --- dask_cuda/tests/test_proxy.py | 51 +++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 830b403d..1a4abafe 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -16,9 +16,10 @@ from dask.sizeof import sizeof from distributed import Client from distributed.protocol.serialize import deserialize, serialize +from distributed.utils_test import gen_test import dask_cuda -from dask_cuda import proxy_object +from dask_cuda import LocalCUDACluster, proxy_object from dask_cuda.disk_io import SpillToDiskFile from dask_cuda.proxify_device_objects import proxify_device_objects from dask_cuda.proxify_host_file import ProxifyHostFile @@ -282,7 +283,8 @@ def test_fixed_attribute_name(): @pytest.mark.parametrize("jit_unspill", [True, False]) -def test_spilling_local_cuda_cluster(jit_unspill): +@gen_test(timeout=20) +async def test_spilling_local_cuda_cluster(jit_unspill): """Testing spilling of a proxied cudf dataframe in a local cuda cluster""" cudf = pytest.importorskip("cudf") dask_cudf = pytest.importorskip("dask_cudf") @@ -299,14 +301,17 @@ def task(x): return x # Notice, setting `device_memory_limit=1B` to trigger spilling - with dask_cuda.LocalCUDACluster( - n_workers=1, device_memory_limit="1B", jit_unspill=jit_unspill + async with LocalCUDACluster( + n_workers=1, + device_memory_limit="1B", + jit_unspill=jit_unspill, + asynchronous=True, ) as cluster: - with Client(cluster): + async with Client(cluster, asynchronous=True) as client: df = cudf.DataFrame({"a": range(10)}) ddf = dask_cudf.from_cudf(df, npartitions=1) ddf = ddf.map_partitions(task, meta=df.head()) - got = ddf.compute() + got = await client.compute(ddf) if isinstance(got, pandas.Series): pytest.xfail( "BUG fixed by " @@ -395,7 +400,8 @@ def _pxy_deserialize(self): @pytest.mark.parametrize("send_serializers", [None, ("dask", "pickle"), ("cuda",)]) @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) -def test_communicating_proxy_objects(protocol, send_serializers): +@gen_test(timeout=20) +async def test_communicating_proxy_objects(protocol, send_serializers): """Testing serialization of cuDF dataframe when communicating""" cudf = pytest.importorskip("cudf") @@ -413,10 +419,13 @@ def task(x): else: assert serializers_used == "dask" - with dask_cuda.LocalCUDACluster( - n_workers=1, protocol=protocol, enable_tcp_over_ucx=protocol == "ucx" + async with dask_cuda.LocalCUDACluster( + n_workers=1, + protocol=protocol, + enable_tcp_over_ucx=protocol == "ucx", + asynchronous=True, ) as cluster: - with Client(cluster) as client: + async with Client(cluster, asynchronous=True) as client: df = cudf.DataFrame({"a": range(10)}) df = proxy_object.asproxy( df, serializers=send_serializers, subclass=_PxyObjTest @@ -429,14 +438,14 @@ def task(x): df._pxy_get().assert_on_deserializing = False else: df._pxy_get().assert_on_deserializing = True - df = client.scatter(df) - client.submit(task, df).result() - client.shutdown() # Avoids a UCX shutdown error + df = await client.scatter(df) + await client.submit(task, df) @pytest.mark.parametrize("protocol", ["tcp", "ucx"]) @pytest.mark.parametrize("shared_fs", [True, False]) -def test_communicating_disk_objects(protocol, shared_fs): +@gen_test(timeout=20) +async def test_communicating_disk_objects(protocol, shared_fs): """Testing disk serialization of cuDF dataframe when 
communicating""" cudf = pytest.importorskip("cudf") ProxifyHostFile._spill_to_disk.shared_filesystem = shared_fs @@ -450,16 +459,18 @@ def task(x): else: assert serializer_used == "dask" - with dask_cuda.LocalCUDACluster( - n_workers=1, protocol=protocol, enable_tcp_over_ucx=protocol == "ucx" + async with dask_cuda.LocalCUDACluster( + n_workers=1, + protocol=protocol, + enable_tcp_over_ucx=protocol == "ucx", + asynchronous=True, ) as cluster: - with Client(cluster) as client: + async with Client(cluster, asynchronous=True) as client: df = cudf.DataFrame({"a": range(10)}) df = proxy_object.asproxy(df, serializers=("disk",), subclass=_PxyObjTest) df._pxy_get().assert_on_deserializing = False - df = client.scatter(df) - client.submit(task, df).result() - client.shutdown() # Avoids a UCX shutdown error + df = await client.scatter(df) + await client.submit(task, df) @pytest.mark.parametrize("array_module", ["numpy", "cupy"]) From 52dd850d4df1a2c7aa2db043ac5fc208f28e458f Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 17 Jan 2023 14:57:54 -0500 Subject: [PATCH 15/31] Use `pkgutil.iter_modules` to get un-imported module for `test_pre_import` (#1085) Changed this because IIUC `pkg_resources.working_set` is listing the installed distributions and not necessarily the importable modules; this becomes an issue if the distribution and module names aren't the same (e.g. one would `conda install pillow` and then `import PIL`), which was causing some failures in CI that seem unrelated to the changes here. _Originally posted by @charlesbluca in https://github.com/rapidsai/dask-cuda/pull/981#discussion_r1072650294_ Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1085 --- dask_cuda/tests/test_dask_cuda_worker.py | 8 ++++---- dask_cuda/tests/test_local_cuda_cluster.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 951e0269..7ff7a9c9 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -1,11 +1,11 @@ from __future__ import absolute_import, division, print_function import os +import pkgutil import subprocess import sys from unittest.mock import patch -import pkg_resources import pytest from distributed import Client, wait @@ -194,9 +194,9 @@ def test_pre_import(loop): # noqa: F811 module = None # Pick a module that isn't currently loaded - for m in pkg_resources.working_set: - if m.key not in sys.modules.keys(): - module = m.key + for m in pkgutil.iter_modules(): + if m.ispkg and m.name not in sys.modules.keys(): + module = m.name break if module is None: diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 5e407080..b0ac8823 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -1,9 +1,9 @@ import asyncio import os +import pkgutil import sys from unittest.mock import patch -import pkg_resources import pytest from dask.distributed import Client @@ -263,9 +263,9 @@ async def test_pre_import(): module = None # Pick a module that isn't currently loaded - for m in pkg_resources.working_set: - if m.key not in sys.modules.keys(): - module = m.key + for m in pkgutil.iter_modules(): + if m.ispkg and m.name not in sys.modules.keys(): + module = m.name 
break if module is None: From c034d2290d821b72a30e326707a3772346aa40e5 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 18 Jan 2023 20:57:54 +0100 Subject: [PATCH 16/31] Update tests for Python 3.10 (#1086) Because in Python 3.10 `asyncio.get_event_loop()` does not create an event loop anymore, using synchronous `LocalCluster` raises `DeprecationWarning`s in `tornado.ioloop.IOLoop`. Ideally we should update all tests to `async`, the changes here are the minimum necessary to unblock Python 3.10. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/dask-cuda/pull/1086 --- dask_cuda/tests/test_proxify_host_file.py | 27 ++++++++++++++--------- pyproject.toml | 2 ++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 1babaa2c..0b0f9d5b 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -239,7 +239,8 @@ def test_spill_on_demand(root_dir): @pytest.mark.parametrize("jit_unspill", [True, False]) -def test_local_cuda_cluster(jit_unspill): +@gen_test(timeout=20) +async def test_local_cuda_cluster(jit_unspill): """Testing spilling of a proxied cudf dataframe in a local cuda cluster""" cudf = pytest.importorskip("cudf") dask_cudf = pytest.importorskip("dask_cudf") @@ -256,14 +257,17 @@ def task(x): return x # Notice, setting `device_memory_limit=1B` to trigger spilling - with dask_cuda.LocalCUDACluster( - n_workers=1, device_memory_limit="1B", jit_unspill=jit_unspill + async with dask_cuda.LocalCUDACluster( + n_workers=1, + device_memory_limit="1B", + jit_unspill=jit_unspill, + asynchronous=True, ) as cluster: - with Client(cluster): + async with Client(cluster, asynchronous=True) as client: df = cudf.DataFrame({"a": range(10)}) ddf = dask_cudf.from_cudf(df, npartitions=1) ddf = ddf.map_partitions(task, meta=df.head()) - got = ddf.compute() + got = await client.compute(ddf) assert_frame_equal(got.to_pandas(), df.to_pandas()) @@ -381,15 +385,18 @@ def test_incompatible_types(root_dir): @pytest.mark.parametrize("npartitions", [1, 2, 3]) @pytest.mark.parametrize("compatibility_mode", [True, False]) -def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions): +@gen_test(timeout=20) +async def test_compatibility_mode_dataframe_shuffle(compatibility_mode, npartitions): cudf = pytest.importorskip("cudf") def is_proxy_object(x): return "ProxyObject" in str(type(x)) with dask.config.set(jit_unspill_compatibility_mode=compatibility_mode): - with dask_cuda.LocalCUDACluster(n_workers=1, jit_unspill=True) as cluster: - with Client(cluster): + async with dask_cuda.LocalCUDACluster( + n_workers=1, jit_unspill=True, asynchronous=True + ) as cluster: + async with Client(cluster, asynchronous=True) as client: ddf = dask.dataframe.from_pandas( cudf.DataFrame({"key": np.arange(10)}), npartitions=npartitions ) @@ -397,8 +404,8 @@ def is_proxy_object(x): # With compatibility mode on, we shouldn't encounter any proxy objects if compatibility_mode: - assert "ProxyObject" not in str(type(res.compute())) - res = res.map_partitions(is_proxy_object).compute() + assert "ProxyObject" not in str(type(await client.compute(res))) + res = await client.compute(res.map_partitions(is_proxy_object)) res = res.to_list() if compatibility_mode: diff --git a/pyproject.toml b/pyproject.toml index 7a88741e..f8d98957 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -121,6 +121,8 @@ filterwarnings = [ # tornado 6.2, remove when dask/distributed#6669 is fixed "ignore:clear_current is deprecated:DeprecationWarning:", "ignore:make_current is deprecated:DeprecationWarning:", + # remove after https://github.com/rapidsai/dask-cuda/issues/1087 is closed + "ignore:There is no current event loop:DeprecationWarning:tornado", ] [tool.setuptools] From 2c88933a9bc0e63c27b46b8920705abf348cdb1c Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Wed, 18 Jan 2023 15:32:45 -0500 Subject: [PATCH 17/31] Ensure tests run for Python `3.10` (#1080) Previously we had disabled `cucim` testing for Python `3.10` because the tests depended on `3.10` packages of `cudf`, which weren't previously available. Now that `3.10` packages of `cudf` are available, we can enable `3.10` testing for `cucim`. Authors: - AJ Schmidt (https://github.com/ajschmidt8) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1080 --- .github/workflows/pr.yaml | 5 +---- ci/release/update-version.sh | 1 + dependencies.yaml | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 3ba8410f..238205c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,10 +30,7 @@ jobs: conda-python-tests: needs: conda-python-build secrets: inherit - # TODO: Switch this testing branch to "cuda-118" after `cudf` `3.10` builds are out. - # There is a circular testing dependency between `dask-cuda` and `cudf` right now, which - # prevents us from running `3.10` tests for `dask-cuda` until `3.10` `cudf` packages are published. - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@main + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: pull-request wheel-build: diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 0938bff0..41658e73 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -37,4 +37,5 @@ sed_runner "s/export UCXPY_VERSION=.*/export UCXPY_VERSION="${NEXT_UCXPY_VERSION # Bump cudf and dask-cudf testing dependencies sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml +sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml diff --git a/dependencies.yaml b/dependencies.yaml index c7964722..3aaf8b58 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -91,7 +91,7 @@ dependencies: common: - output_types: [conda] packages: - - cucim + - cucim=23.02 - cudf=23.02 - dask-cudf=23.02 - pytest From cf179f138aa6310e7663018e10b304c4c88613da Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Thu, 19 Jan 2023 09:25:52 +0000 Subject: [PATCH 18/31] Switch to the new dask CLI (#981) In https://github.com/dask/dask/pull/9283 we are adding a new top level `dask` CLI command which can be extended by other modules using entry points. A primary motivation here is to improve discoverability by uniting everything under one tool and allowing folks to run `dask --help` and `dask --help` to learn more about the various tools. This PR adds a new `click` group called `cuda` and moves the `dask-cuda-worker` command under that group with the name `worker`. 
This means the `dask-cuda-worker` becomes `dask cuda worker` in the new CLI tool. I haven't made any changes to the existing `dask-cuda-worker` console script so that will still continue to work, but maybe we should add a deprecation warning to it? I went with this name rather than `dask cuda-worker` because I think it is more readable and also leaves us open to adding more subcommands in the future without cluttering up the top-level `dask` namespace. ```console $ dask --help Usage: dask [OPTIONS] COMMAND [ARGS]... Dask command line interface. Options: --version Show the version and exit. -h, --help Show this message and exit. Commands: cluster Manage dask clusters. cuda GPU subcommands. docs Open Dask documentation (https://docs.dask.org/) in a web browser. info Information about your dask installation. scheduler Launch a distributed scheduler. ssh Launch a distributed cluster over SSH. worker Launch a distributed worker attached to an existing SCHEDULER. ``` ```console $ dask cuda --help Usage: dask cuda [OPTIONS] COMMAND [ARGS]... GPU subcommands. Options: -h, --help Show this message and exit. Commands: worker Launch a distributed worker with GPUs attached to an existing SCHEDULER. ``` ```console $ dask cuda worker --help Usage: dask cuda worker [OPTIONS] [SCHEDULER] [PRELOAD_ARGV]... Launch a distributed worker with GPUs attached to an existing SCHEDULER. See https://docs.rapids.ai/api/dask-cuda/stable/quickstart.html#dask-cuda-worker for info. Options: --host TEXT IP address of serving host; should be visible to the scheduler and other workers. Can be a string (like ``"127.0.0.1"``) or ``None`` to fall back on the address of the interface specified by ``--interface`` or the default interface. --nthreads INTEGER Number of threads to be used for each Dask worker process. [default: 1] ... ``` The CLI PR needs to be merged and released before this can be merged. 
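For readers unfamiliar with the new plugin mechanism, the sketch below shows roughly how a package hooks a `click` group into the `dask` command via a `dask_cli` entry point. The names mirror the dask-cuda changes in the diff below (a `cuda` group exported from `dask_cuda.cli` and registered in `pyproject.toml`); the snippet itself is illustrative only and is not part of this patch.

```python
# Illustrative sketch only -- the real implementation is in dask_cuda/cli.py below.
# The group is exposed to the top-level ``dask`` command through a ``dask_cli``
# entry point, e.g. in pyproject.toml:
#   [project.entry-points.dask_cli]
#   cuda = "dask_cuda.cli:cuda"
import click


@click.group
def cuda():
    """GPU subcommands."""


@cuda.command(name="worker", context_settings=dict(ignore_unknown_options=True))
@click.argument("scheduler", type=str, required=False)
def worker(scheduler):
    """Launch a distributed worker with GPUs attached to an existing SCHEDULER."""
    click.echo(f"Would start a CUDA worker for scheduler: {scheduler}")
```

With the entry point installed, `dask cuda worker ...` resolves to the `worker` command of this group, while the existing `dask-cuda-worker` console script can keep pointing at the same function.
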
Fixes https://github.com/rapidsai/dask-cuda/issues/1038 Authors: - Jacob Tomlinson (https://github.com/jacobtomlinson) - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/981 --- conda/recipes/dask-cuda/meta.yaml | 6 + dask_cuda/{cli/dask_cuda_worker.py => cli.py} | 151 +++++++++++++----- dask_cuda/cli/__init__.py | 0 dask_cuda/cli/dask_config.py | 95 ----------- dask_cuda/cuda_worker.py | 2 +- dask_cuda/initialize.py | 2 +- dask_cuda/tests/test_dask_cuda_worker.py | 80 ++++++---- docs/source/api.rst | 15 +- docs/source/examples/ucx.rst | 24 +-- docs/source/examples/worker_count.rst | 8 +- docs/source/index.rst | 2 +- docs/source/quickstart.rst | 12 +- docs/source/spilling.rst | 20 +-- docs/source/ucx.rst | 4 +- examples/ucx/dask_cuda_worker.sh | 6 +- pyproject.toml | 7 +- 16 files changed, 226 insertions(+), 208 deletions(-) rename dask_cuda/{cli/dask_cuda_worker.py => cli.py} (82%) mode change 100755 => 100644 delete mode 100644 dask_cuda/cli/__init__.py delete mode 100755 dask_cuda/cli/dask_config.py diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index b0b02cb2..cc26426d 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -41,6 +41,12 @@ requirements: test: imports: - dask_cuda + commands: + - dask cuda --help + {% for e in data.get("project", {}).get("scripts", {}).keys() %} + - {{ e }} --help + - {{ e|replace("-", " ") }} --help + {% endfor %} about: home: https://rapids.ai/ diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli.py old mode 100755 new mode 100644 similarity index 82% rename from dask_cuda/cli/dask_cuda_worker.py rename to dask_cuda/cli.py index 62faeddb..7e3b0e75 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli.py @@ -5,25 +5,62 @@ import click from tornado.ioloop import IOLoop, TimeoutError -from dask import config +from dask import config as dask_config +from distributed import Client from distributed.cli.utils import install_signal_handlers from distributed.preloading import validate_preload_argv from distributed.security import Security from distributed.utils import import_term -from ..cuda_worker import CUDAWorker +from .cuda_worker import CUDAWorker +from .utils import print_cluster_config logger = logging.getLogger(__name__) pem_file_option_type = click.Path(exists=True, resolve_path=True) - - -@click.command(context_settings=dict(ignore_unknown_options=True)) -@click.argument("scheduler", type=str, required=False) -@click.argument( +scheduler = click.argument("scheduler", type=str, required=False) +preload_argv = click.argument( "preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv ) +scheduler_file = click.option( + "--scheduler-file", + type=str, + default=None, + help="""Filename to JSON encoded scheduler information. To be used in conjunction + with the equivalent ``dask scheduler`` option.""", +) +tls_ca_file = click.option( + "--tls-ca-file", + type=pem_file_option_type, + default=None, + help="""CA certificate(s) file for TLS (in PEM format). 
Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +tls_cert = click.option( + "--tls-cert", + type=pem_file_option_type, + default=None, + help="""Certificate file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +tls_key = click.option( + "--tls-key", + type=pem_file_option_type, + default=None, + help="""Private key file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no private key.""", +) + + +@click.group +def cuda(): + """Subcommands to launch or query distributed workers with GPUs.""" + + +@cuda.command(name="worker", context_settings=dict(ignore_unknown_options=True)) +@scheduler +@preload_argv @click.option( "--host", type=str, @@ -174,13 +211,7 @@ specified by `"jit-unspill-shared-fs"`. Notice, a shared filesystem must support the `os.link()` operation.""", ) -@click.option( - "--scheduler-file", - type=str, - default=None, - help="""Filename to JSON encoded scheduler information. To be used in conjunction - with the equivalent ``dask-scheduler`` option.""", -) +@scheduler_file @click.option( "--protocol", type=str, default=None, help="Protocol like tcp, tls, or ucx" ) @@ -208,27 +239,9 @@ help="""Prefix for the dashboard. Can be a string (like ...) or ``None`` for no prefix.""", ) -@click.option( - "--tls-ca-file", - type=pem_file_option_type, - default=None, - help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like - ``"path/to/certs"``), or ``None`` for no certificate(s).""", -) -@click.option( - "--tls-cert", - type=pem_file_option_type, - default=None, - help="""Certificate file for TLS (in PEM format). Can be a string (like - ``"path/to/certs"``), or ``None`` for no certificate(s).""", -) -@click.option( - "--tls-key", - type=pem_file_option_type, - default=None, - help="""Private key file for TLS (in PEM format). Can be a string (like - ``"path/to/certs"``), or ``None`` for no private key.""", -) +@tls_ca_file +@tls_cert +@tls_key @click.option( "--enable-tcp-over-ucx/--disable-tcp-over-ucx", default=None, @@ -288,7 +301,7 @@ type=click.Choice(["spawn", "fork", "forkserver"]), help="""Method used to start new processes with multiprocessing""", ) -def main( +def worker( scheduler, host, nthreads, @@ -324,6 +337,15 @@ def main( multiprocessing_method, **kwargs, ): + """Launch a distributed worker with GPUs attached to an existing scheduler. + + A scheduler can be specified either through a URI passed through the ``SCHEDULER`` + argument or a scheduler file passed through the ``--scheduler-file`` option. + + See + https://docs.rapids.ai/api/dask-cuda/stable/quickstart.html#dask-cuda-worker + for info. 
+ """ if multiprocessing_method == "forkserver": import multiprocessing.forkserver as f @@ -347,7 +369,7 @@ def main( if worker_class is not None: worker_class = import_term(worker_class) - with config.set( + with dask_config.set( {"distributed.worker.multiprocessing-method": multiprocessing_method} ): worker = CUDAWorker( @@ -404,9 +426,56 @@ async def run(): logger.info("End worker") -def go(): - main() +@cuda.command(name="config", context_settings=dict(ignore_unknown_options=True)) +@scheduler +@preload_argv +@scheduler_file +@click.option( + "--get-cluster-configuration", + "get_cluster_conf", + default=False, + is_flag=True, + required=False, + show_default=True, + help="""Print a table of the current cluster configuration""", +) +@tls_ca_file +@tls_cert +@tls_key +def config( + scheduler, + scheduler_file, + get_cluster_conf, + tls_ca_file, + tls_cert, + tls_key, + **kwargs, +): + """Query an existing GPU cluster's configuration. + + A cluster can be specified either through a URI passed through the ``SCHEDULER`` + argument or a scheduler file passed through the ``--scheduler-file`` option. + """ + if tls_ca_file and tls_cert and tls_key: + security = Security( + tls_ca_file=tls_ca_file, + tls_worker_cert=tls_cert, + tls_worker_key=tls_key, + ) + else: + security = None + + if isinstance(scheduler, str) and scheduler.startswith("-"): + raise ValueError( + "The scheduler address can't start with '-'. Please check " + "your command line arguments, you probably attempted to use " + "unsupported one. Scheduler address: %s" % scheduler + ) -if __name__ == "__main__": - go() + if get_cluster_conf: + if scheduler_file is not None: + client = Client(scheduler_file=scheduler_file, security=security) + else: + client = Client(scheduler, security=security) + print_cluster_config(client) diff --git a/dask_cuda/cli/__init__.py b/dask_cuda/cli/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dask_cuda/cli/dask_config.py b/dask_cuda/cli/dask_config.py deleted file mode 100755 index 51c9aa2b..00000000 --- a/dask_cuda/cli/dask_config.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import absolute_import, division, print_function - -import logging - -import click - -from distributed import Client -from distributed.preloading import validate_preload_argv -from distributed.security import Security - -from ..utils import print_cluster_config - -logger = logging.getLogger(__name__) - - -pem_file_option_type = click.Path(exists=True, resolve_path=True) - - -@click.command(context_settings=dict(ignore_unknown_options=True)) -@click.argument("scheduler", type=str, required=False) -@click.argument( - "preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv -) -@click.option( - "--scheduler-file", - type=str, - default=None, - help="""Filename to JSON encoded scheduler information. To be used in conjunction - with the equivalent ``dask-scheduler`` option.""", -) -@click.option( - "--get-cluster-configuration", - "get_cluster_conf", - default=False, - is_flag=True, - required=False, - show_default=True, - help="""Print a table of the current cluster configuration""", -) -@click.option( - "--tls-ca-file", - type=pem_file_option_type, - default=None, - help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like - ``"path/to/certs"``), or ``None`` for no certificate(s).""", -) -@click.option( - "--tls-cert", - type=pem_file_option_type, - default=None, - help="""Certificate file for TLS (in PEM format). 
Can be a string (like - ``"path/to/certs"``), or ``None`` for no certificate(s).""", -) -@click.option( - "--tls-key", - type=pem_file_option_type, - default=None, - help="""Private key file for TLS (in PEM format). Can be a string (like - ``"path/to/certs"``), or ``None`` for no private key.""", -) -def main( - scheduler, - scheduler_file, - get_cluster_conf, - tls_ca_file, - tls_cert, - tls_key, - **kwargs, -): - if tls_ca_file and tls_cert and tls_key: - security = Security( - tls_ca_file=tls_ca_file, - tls_worker_cert=tls_cert, - tls_worker_key=tls_key, - ) - else: - security = None - - if isinstance(scheduler, str) and scheduler.startswith("-"): - raise ValueError( - "The scheduler address can't start with '-'. Please check " - "your command line arguments, you probably attempted to use " - "unsupported one. Scheduler address: %s" % scheduler - ) - - if get_cluster_conf: - if scheduler_file is not None: - client = Client(scheduler_file=scheduler_file, security=security) - else: - client = Client(scheduler, security=security) - print_cluster_config(client) - - -if __name__ == "__main__": - main() diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index b7682de2..e499def5 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -117,7 +117,7 @@ def del_pid_file(): ): raise ValueError( "Need to provide scheduler address like\n" - "dask-worker SCHEDULER_ADDRESS:8786" + "dask cuda worker SCHEDULER_ADDRESS:8786" ) if isinstance(scheduler, Cluster): diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index f03f99ec..52a67e31 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -73,7 +73,7 @@ def initialize( To ensure UCX works correctly, it is important to ensure it is initialized with the correct options. This is especially important for the client, which cannot be configured to use UCX with arguments like ``LocalCUDACluster`` and - ``dask-cuda-worker``. This function will ensure that they are provided a UCX + ``dask cuda worker``. This function will ensure that they are provided a UCX configuration based on the flags and options passed by the user. 
This function can also be used within a worker preload script for UCX configuration diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 7ff7a9c9..64950e2b 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -25,10 +25,12 @@ @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,3,7,8"}) def test_cuda_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811 nthreads = 4 - with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9359", "--host", "127.0.0.1", @@ -62,10 +64,12 @@ def get_visible_devices(): def test_rmm_pool(loop): # noqa: F811 rmm = pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -86,10 +90,12 @@ def test_rmm_pool(loop): # noqa: F811 def test_rmm_managed(loop): # noqa: F811 rmm = pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -115,10 +121,12 @@ def test_rmm_async(loop): # noqa: F811 if driver_version < 11020 or runtime_version < 11020: pytest.skip("cudaMallocAsync not supported") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -138,10 +146,12 @@ def test_rmm_async(loop): # noqa: F811 def test_rmm_logging(loop): # noqa: F811 rmm = pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -164,10 +174,12 @@ def test_rmm_logging(loop): # noqa: F811 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) def test_dashboard_address(loop): # noqa: F811 - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--dashboard-address", "127.0.0.1:9370", @@ -184,7 +196,9 @@ def test_dashboard_address(loop): # noqa: F811 def test_unknown_argument(): - ret = subprocess.run(["dask-cuda-worker", "--my-argument"], capture_output=True) + ret = subprocess.run( + ["dask", "cuda", "worker", "--my-argument"], capture_output=True + ) assert ret.returncode != 0 assert b"Scheduler address: --my-argument" in ret.stderr @@ -202,10 +216,12 @@ def test_pre_import(loop): # noqa: F811 if module is None: pytest.skip("No module found that isn't already loaded") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--pre-import", module, @@ -221,9 +237,9 @@ def test_pre_import(loop): # noqa: F811 @pytest.mark.timeout(20) 
@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) def test_pre_import_not_found(): - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): ret = subprocess.run( - ["dask-cuda-worker", "127.0.0.1:9369", "--pre-import", "my_module"], + ["dask", "cuda", "worker", "127.0.0.1:9369", "--pre-import", "my_module"], capture_output=True, ) assert ret.returncode != 0 @@ -241,10 +257,12 @@ def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": cuda_visible_devices}): nthreads = len(cuda_visible_devices) - with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9359", "--host", "127.0.0.1", @@ -276,10 +294,12 @@ def test_cuda_visible_devices_uuid(loop): # noqa: F811 gpu_uuid = get_gpu_uuid_from_index(0) with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": gpu_uuid}): - with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9359", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9359", "--host", "127.0.0.1", @@ -297,10 +317,12 @@ def test_cuda_visible_devices_uuid(loop): # noqa: F811 def test_rmm_track_allocations(loop): # noqa: F811 rmm = pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -329,10 +351,12 @@ def test_rmm_track_allocations(loop): # noqa: F811 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) def test_get_cluster_configuration(loop): # noqa: F811 pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", @@ -360,10 +384,12 @@ def test_get_cluster_configuration(loop): # noqa: F811 @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) def test_worker_fraction_limits(loop): # noqa: F811 pytest.importorskip("rmm") - with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen(["dask", "scheduler", "--port", "9369", "--no-dashboard"]): with popen( [ - "dask-cuda-worker", + "dask", + "cuda", + "worker", "127.0.0.1:9369", "--host", "127.0.0.1", diff --git a/docs/source/api.rst b/docs/source/api.rst index 10a3ed6d..7989fa5e 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -7,10 +7,19 @@ Cluster .. autoclass:: LocalCUDACluster :members: +CLI +--- + Worker ------- -.. click:: dask_cuda.cli.dask_cuda_worker:main - :prog: dask-cuda-worker +~~~~~~ +.. click:: dask_cuda.cli:worker + :prog: dask cuda + :nested: none + +Cluster configuration +~~~~~~~~~~~~~~~~~~~~~ +.. 
click:: dask_cuda.cli:config + :prog: dask cuda :nested: none Client initialization diff --git a/docs/source/examples/ucx.rst b/docs/source/examples/ucx.rst index b9a36777..6230caf6 100644 --- a/docs/source/examples/ucx.rst +++ b/docs/source/examples/ucx.rst @@ -1,7 +1,7 @@ Enabling UCX communication ========================== -A CUDA cluster using UCX communication can be started automatically with LocalCUDACluster or manually with the ``dask-cuda-worker`` CLI tool. +A CUDA cluster using UCX communication can be started automatically with LocalCUDACluster or manually with the ``dask cuda worker`` CLI tool. In either case, a ``dask.distributed.Client`` must be made for the worker cluster using the same Dask UCX configuration; see `UCX Integration -- Configuration <../ucx.html#configuration>`_ for details on all available options. LocalCUDACluster with Automatic Configuration @@ -48,10 +48,10 @@ To connect a client to a cluster with all supported transports and an RMM pool: ) client = Client(cluster) -dask-cuda-worker with Automatic Configuration ---------------------------------------------- +``dask cuda worker`` with Automatic Configuration +------------------------------------------------- -When using ``dask-cuda-worker`` with UCX communication and automatic configuration, the scheduler, workers, and client must all be started manually, but without specifying any UCX transports explicitly. This is only supported in Dask-CUDA 22.02 and newer and requires UCX >= 1.11.1. +When using ``dask cuda worker`` with UCX communication and automatic configuration, the scheduler, workers, and client must all be started manually, but without specifying any UCX transports explicitly. This is only supported in Dask-CUDA 22.02 and newer and requires UCX >= 1.11.1. Scheduler ^^^^^^^^^ @@ -64,7 +64,7 @@ To start a Dask scheduler using UCX with automatic configuration and one GB of R $ DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True \ > DASK_DISTRIBUTED__RMM__POOL_SIZE=1GB \ - > dask-scheduler --protocol ucx --interface ib0 + > dask scheduler --protocol ucx --interface ib0 .. note:: The ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface. @@ -79,7 +79,7 @@ To start workers with automatic UCX configuration and an RMM pool of 14GB per GP .. code-block:: bash $ UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda - > dask-cuda-worker ucx://:8786 \ + > dask cuda worker ucx://:8786 \ > --rmm-pool-size="14GB" \ > --interface="ib0" @@ -121,15 +121,15 @@ Alternatively, the ``with dask.config.set`` statement from the example above may .. note:: We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here `_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand. -dask-cuda-worker with Manual Configuration +``dask cuda worker`` with Manual Configuration ------------------------------------------ -When using ``dask-cuda-worker`` with UCX communication and manual configuration, the scheduler, workers, and client must all be started manually, each using the same UCX configuration. 
+When using ``dask cuda worker`` with UCX communication and manual configuration, the scheduler, workers, and client must all be started manually, each using the same UCX configuration. Scheduler ^^^^^^^^^ -UCX configuration options will need to be specified for ``dask-scheduler`` as environment variables; see `Dask Configuration -- Environment Variables `_ for more details on the mapping between environment variables and options. +UCX configuration options will need to be specified for ``dask scheduler`` as environment variables; see `Dask Configuration -- Environment Variables `_ for more details on the mapping between environment variables and options. To start a Dask scheduler using UCX with all supported transports and an gigabyte RMM pool: @@ -141,19 +141,19 @@ To start a Dask scheduler using UCX with all supported transports and an gigabyt > DASK_DISTRIBUTED__COMM__UCX__INFINIBAND=True \ > DASK_DISTRIBUTED__COMM__UCX__RDMACM=True \ > DASK_DISTRIBUTED__RMM__POOL_SIZE=1GB \ - > dask-scheduler --protocol ucx --interface ib0 + > dask scheduler --protocol ucx --interface ib0 We communicate to the scheduler that we will be using UCX with the ``--protocol`` option, and that we will be using InfiniBand with the ``--interface`` option. Workers ^^^^^^^ -All UCX configuration options have analogous options in ``dask-cuda-worker``; see `API -- Worker <../api.html#worker>`_ for a complete list of these options. +All UCX configuration options have analogous options in ``dask cuda worker``; see `API -- Worker <../api.html#worker>`_ for a complete list of these options. To start a cluster with all supported transports and an RMM pool: .. code-block:: bash - $ dask-cuda-worker ucx://:8786 \ + $ dask cuda worker ucx://:8786 \ > --enable-tcp-over-ucx \ > --enable-nvlink \ > --enable-infiniband \ diff --git a/docs/source/examples/worker_count.rst b/docs/source/examples/worker_count.rst index 62954ffb..40123672 100644 --- a/docs/source/examples/worker_count.rst +++ b/docs/source/examples/worker_count.rst @@ -20,14 +20,14 @@ This argument can be used on its own or in conjunction with ``CUDA_VISIBLE_DEVIC cluster = LocalCUDACluster(n_workers=2) # will use GPUs 0,1 cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="3,4,5", n_workers=2) # will use GPUs 3,4 -When using ``dask-cuda-worker``, ``CUDA_VISIBLE_DEVICES`` must be provided as an environment variable: +When using ``dask cuda worker``, ``CUDA_VISIBLE_DEVICES`` must be provided as an environment variable: .. code-block:: bash - $ dask-scheduler + $ dask scheduler distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 - $ CUDA_VISIBLE_DEVICES=0,1 dask-cuda-worker 127.0.0.1:8786 + $ CUDA_VISIBLE_DEVICES=0,1 dask cuda worker 127.0.0.1:8786 GPUs can also be selected by their UUIDs, which can be acquired using `NVIDIA System Management Interface `_: @@ -46,4 +46,4 @@ These UUIDs can then be passed to ``CUDA_VISIBLE_DEVICES`` in place of a GPU ind .. 
code-block:: bash $ CUDA_VISIBLE_DEVICES="GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31" \ - > dask-cuda-worker 127.0.0.1:8786 + > dask cuda worker 127.0.0.1:8786 diff --git a/docs/source/index.rst b/docs/source/index.rst index a43f2907..37ba1213 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -9,7 +9,7 @@ Motivation While Distributed can be used to leverage GPU workloads through libraries such as `cuDF `_, `CuPy `_, and `Numba `_, Dask-CUDA offers several unique features unavailable to Distributed: -- **Automatic instantiation of per-GPU workers** -- Using Dask-CUDA's LocalCUDACluster or ``dask-cuda-worker`` CLI will automatically launch one worker for each GPU available on the executing node, avoiding the need to explicitly select GPUs. +- **Automatic instantiation of per-GPU workers** -- Using Dask-CUDA's LocalCUDACluster or ``dask cuda worker`` CLI will automatically launch one worker for each GPU available on the executing node, avoiding the need to explicitly select GPUs. - **Automatic setting of CPU affinity** -- The setting of CPU affinity for each GPU is done automatically, preventing memory transfers from taking suboptimal paths. - **Automatic selection of InfiniBand devices** -- When UCX communication is enabled over InfiniBand, Dask-CUDA automatically selects the optimal InfiniBand device for each GPU (see `UCX Integration `_ for instructions on configuring UCX communication). - **Memory spilling from GPU** -- For memory-intensive workloads, Dask-CUDA supports spilling from GPU to host memory when a GPU reaches the default or user-specified memory utilization limit. diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst index ce9ea2f2..c5592b43 100644 --- a/docs/source/quickstart.rst +++ b/docs/source/quickstart.rst @@ -1,7 +1,7 @@ Quickstart ========== -A Dask-CUDA cluster can be created using either LocalCUDACluster or ``dask-cuda-worker`` from the command line. +A Dask-CUDA cluster can be created using either LocalCUDACluster or ``dask cuda worker`` from the command line. LocalCUDACluster ---------------- @@ -16,17 +16,17 @@ To create a Dask-CUDA cluster using all available GPUs and connect a Dask.distri cluster = LocalCUDACluster() client = Client(cluster) -dask-cuda-worker ----------------- +``dask cuda worker`` +-------------------- -To create an equivalent cluster from the command line, Dask-CUDA workers must be connected to a scheduler started with ``dask-scheduler``: +To create an equivalent cluster from the command line, Dask-CUDA workers must be connected to a scheduler started with ``dask scheduler``: .. code-block:: bash - $ dask-scheduler + $ dask scheduler distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 - $ dask-cuda-worker 127.0.0.1:8786 + $ dask cuda worker 127.0.0.1:8786 To connect a client to this cluster: diff --git a/docs/source/spilling.rst b/docs/source/spilling.rst index ba8e7b93..28f3562b 100644 --- a/docs/source/spilling.rst +++ b/docs/source/spilling.rst @@ -19,17 +19,17 @@ Memory spilling can be disabled by setting ``device_memory_limit`` to 0: cluster = LocalCUDACluster(device_memory_limit=0) # spilling disabled -The same applies for ``dask-cuda-worker``, and spilling can be controlled by setting ``--device-memory-limit``: +The same applies for ``dask cuda worker``, and spilling can be controlled by setting ``--device-memory-limit``: .. 
code-block:: - $ dask-scheduler + $ dask scheduler distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 - $ dask-cuda-worker --device-memory-limit 50000 - $ dask-cuda-worker --device-memory-limit 5GB - $ dask-cuda-worker --device-memory-limit 0.3 - $ dask-cuda-worker --device-memory-limit 0 + $ dask cuda worker --device-memory-limit 50000 + $ dask cuda worker --device-memory-limit 5GB + $ dask cuda worker --device-memory-limit 0.3 + $ dask cuda worker --device-memory-limit 0 JIT-Unspill @@ -65,19 +65,19 @@ Or set the worker argument ``--enable-jit-unspill​`` .. code-block:: - $ dask-scheduler + $ dask scheduler distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 - $ dask-cuda-worker --enable-jit-unspill​ + $ dask cuda worker --enable-jit-unspill​ Or environment variable ``DASK_JIT_UNSPILL=True`` .. code-block:: - $ dask-scheduler + $ dask scheduler distributed.scheduler - INFO - Scheduler at: tcp://127.0.0.1:8786 - $ DASK_JIT_UNSPILL=True dask-cuda-worker​ + $ DASK_JIT_UNSPILL=True dask cuda worker​ Limitations diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index fe9b95c4..7463f0c1 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -37,7 +37,7 @@ Automatic Beginning with Dask-CUDA 22.02 and assuming UCX >= 1.11.1, specifying UCX transports is now optional. -A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask-scheduler --protocol="ucx"`` and connecting a ``dask-cuda-worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication `_ for more details examples of UCX usage with automatic configuration. +A local cluster can now be started with ``LocalCUDACluster(protocol="ucx")``, implying automatic UCX transport selection (``UCX_TLS=all``). Starting a cluster separately -- scheduler, workers and client as different processes -- is also possible, as long as Dask scheduler is created with ``dask scheduler --protocol="ucx"`` and connecting a ``dask cuda worker`` to the scheduler will imply automatic UCX transport selection, but that requires the Dask scheduler and client to be started with ``DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT=True``. See `Enabling UCX communication `_ for more details examples of UCX usage with automatic configuration. Configuring transports manually is still possible, please refer to the subsection below. @@ -97,7 +97,7 @@ this when using Dask-CUDA's UCX integration, processes launched via multiprocessing should use the start processes using the `"forkserver" `_ -method. When launching workers using `dask-cuda-worker `_, this can be +method. When launching workers using `dask cuda worker `_, this can be achieved by passing ``--multiprocessing-method forkserver`` as an argument. 
In user code, the method can be controlled with the ``distributed.worker.multiprocessing-method`` configuration key in diff --git a/examples/ucx/dask_cuda_worker.sh b/examples/ucx/dask_cuda_worker.sh index f1ec9818..f139bfd6 100644 --- a/examples/ucx/dask_cuda_worker.sh +++ b/examples/ucx/dask_cuda_worker.sh @@ -3,7 +3,7 @@ usage() { echo "usage: $0 [-a ] [-i ] [-r ] [-t ]" >&2 exit 1 - } + } # parse arguments rmm_pool_size=1GB @@ -46,7 +46,7 @@ if [[ $transport == *"ib"* ]]; then fi # initialize scheduler -dask-scheduler $scheduler_flags & +dask scheduler $scheduler_flags & # initialize workers -dask-cuda-worker $worker_flags +dask cuda worker $worker_flags diff --git a/pyproject.toml b/pyproject.toml index f8d98957..9b4b5633 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,8 +39,11 @@ classifiers=[ ] [project.scripts] -dask-cuda-worker = "dask_cuda.cli.dask_cuda_worker:go" -dask-config = "dask_cuda.cli.dask_config:go" +dask-cuda-worker = "dask_cuda.cli:worker" +dask-cuda-config = "dask_cuda.cli:config" + +[project.entry-points.dask_cli] +cuda = "dask_cuda.cli:cuda" [project.optional-dependencies] docs = [ From 03e5dcc8cf13fdc1c16fad84e2ab3387d712c355 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 19 Jan 2023 13:14:47 -0500 Subject: [PATCH 19/31] Remove `--get-cluster-configuration` option, check for scheduler in `dask cuda config` (#1088) As @pentschev brought up in https://github.com/rapidsai/dask-cuda/pull/981#discussion_r1069887904, we shouldn't need the `--get-cluster-configuration` option for `dask cuda config` since it only enables/disables printing the cluster configuration. Also added a check to ensure that a scheduler address or scheduler file has been specified, as otherwise IIUC running `dask cuda config` would just end up starting up and querying a local cluster on CPU. EDIT: Modified the scheduler check for `dask cuda worker` as well since it seems like a general improvement Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1088 --- dask_cuda/cli.py | 47 ++++++++++++++++++++-------------------- dask_cuda/cuda_worker.py | 11 ++++++---- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index 7e3b0e75..e2690f15 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -430,22 +430,12 @@ async def run(): @scheduler @preload_argv @scheduler_file -@click.option( - "--get-cluster-configuration", - "get_cluster_conf", - default=False, - is_flag=True, - required=False, - show_default=True, - help="""Print a table of the current cluster configuration""", -) @tls_ca_file @tls_cert @tls_key def config( scheduler, scheduler_file, - get_cluster_conf, tls_ca_file, tls_cert, tls_key, @@ -456,6 +446,25 @@ def config( A cluster can be specified either through a URI passed through the ``SCHEDULER`` argument or a scheduler file passed through the ``--scheduler-file`` option. """ + if ( + scheduler is None + and scheduler_file is None + and dask_config.get("scheduler-address", None) is None + ): + raise ValueError( + "No scheduler specified. 
A scheduler can be specified by " + "passing an address through the SCHEDULER argument or " + "'dask.scheduler-address' config option, or by passing the " + "location of a scheduler file through the --scheduler-file " + "option" + ) + + if isinstance(scheduler, str) and scheduler.startswith("-"): + raise ValueError( + "The scheduler address can't start with '-'. Please check " + "your command line arguments, you probably attempted to use " + "unsupported one. Scheduler address: %s" % scheduler + ) if tls_ca_file and tls_cert and tls_key: security = Security( @@ -466,16 +475,8 @@ def config( else: security = None - if isinstance(scheduler, str) and scheduler.startswith("-"): - raise ValueError( - "The scheduler address can't start with '-'. Please check " - "your command line arguments, you probably attempted to use " - "unsupported one. Scheduler address: %s" % scheduler - ) - - if get_cluster_conf: - if scheduler_file is not None: - client = Client(scheduler_file=scheduler_file, security=security) - else: - client = Client(scheduler, security=security) - print_cluster_config(client) + if scheduler_file is not None: + client = Client(scheduler_file=scheduler_file, security=security) + else: + client = Client(scheduler, security=security) + print_cluster_config(client) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index e499def5..03b16b52 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -111,13 +111,16 @@ def del_pid_file(): kwargs = {"worker_port": None, "listen_address": None, **kwargs} if ( - not scheduler - and not scheduler_file + scheduler is None + and scheduler_file is None and dask.config.get("scheduler-address", None) is None ): raise ValueError( - "Need to provide scheduler address like\n" - "dask cuda worker SCHEDULER_ADDRESS:8786" + "No scheduler specified. A scheduler can be specified by " + "passing an address through the SCHEDULER argument or " + "'dask.scheduler-address' config option, or by passing the " + "location of a scheduler file through the --scheduler-file " + "option" ) if isinstance(scheduler, Cluster): From e9609c678301fda9c3ac64487c15468c4291cb09 Mon Sep 17 00:00:00 2001 From: Ajay Thorve Date: Fri, 20 Jan 2023 10:33:08 -0800 Subject: [PATCH 20/31] add initial docs build (#1089) The PR adds a docs_build process to the PR and Build workflows for this repository. The generated docs are synced to s3 for only the build workflows. 
cc @ajschmidt8 Authors: - Ajay Thorve (https://github.com/AjayThorve) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1089 --- .github/workflows/build.yaml | 11 +++++++++++ .github/workflows/pr.yaml | 11 +++++++++++ ci/build_docs.sh | 38 ++++++++++++++++++++++++++++++++++++ dependencies.yaml | 15 ++++++++++++++ 4 files changed, 75 insertions(+) create mode 100755 ci/build_docs.sh diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6376d33c..d36d0e81 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,6 +34,17 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + docs-build: + if: ${{ startsWith(github.ref, 'refs/heads/branch-') }} + needs: [conda-python-build] + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + with: + build_type: branch + node_type: "gpu-latest-1" + arch: "amd64" + container_image: "rapidsai/ci:latest" + run_script: "ci/build_docs.sh" upload-conda: needs: [conda-python-build] secrets: inherit diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 238205c1..730b3587 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -15,6 +15,7 @@ jobs: - checks - conda-python-build - conda-python-tests + - docs-build - wheel-build secrets: inherit uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 @@ -33,6 +34,16 @@ jobs: uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 with: build_type: pull-request + docs-build: + needs: conda-python-build + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + with: + build_type: pull-request + node_type: "gpu-latest-1" + arch: "amd64" + container_image: "rapidsai/ci:latest" + run_script: "ci/build_docs.sh" wheel-build: needs: checks runs-on: ubuntu-latest diff --git a/ci/build_docs.sh b/ci/build_docs.sh new file mode 100755 index 00000000..338ff974 --- /dev/null +++ b/ci/build_docs.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -euo pipefail + +rapids-logger "Create test conda environment" +. 
/opt/conda/etc/profile.d/conda.sh + +rapids-dependency-file-generator \ + --output conda \ + --file_key docs \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + +rapids-mamba-retry env create --force -f env.yaml -n docs +conda activate docs + +rapids-print-env + +rapids-logger "Downloading artifacts from previous jobs" + +PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) +VERSION_NUMBER=$(rapids-get-rapids-version-from-git) + +rapids-mamba-retry install \ + --channel "${PYTHON_CHANNEL}" \ + dask-cuda + +# Build Python docs +rapids-logger "Build Python docs" +pushd docs +sphinx-build -b dirhtml ./source _html +sphinx-build -b text ./source _text +popd + +if [[ "${RAPIDS_BUILD_TYPE}" == "branch" ]]; then + rapids-logger "Upload Docs to S3" + aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/html" + aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/txt" +fi diff --git a/dependencies.yaml b/dependencies.yaml index 3aaf8b58..2d673971 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,6 +6,7 @@ files: - build_python - cudatoolkit - develop + - docs - py_version - run_python - test_python @@ -20,6 +21,12 @@ files: includes: - develop - py_version + docs: + output: none + includes: + - cudatoolkit + - docs + - py_version channels: - rapidsai - rapidsai-nightly @@ -57,6 +64,14 @@ dependencies: - output_types: [conda, requirements] packages: - pre-commit + docs: + common: + - output_types: [conda, requirements] + packages: + - numpydoc + - sphinx + - sphinx-click + - sphinx_rtd_theme py_version: specific: - output_types: conda From 963b745437ce45e4db4a0c8c382ed52bf6116033 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Jan 2023 10:31:12 +0100 Subject: [PATCH 21/31] shuffle: use cuDF's `partition_by_hash()` when available (#1090) cuDF's `partition_by_hash()` is faster than calling `compute_map_index()` followed by `scatter_by_map()`. Depend on https://github.com/rapidsai/cudf/pull/12554 Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1090 --- dask_cuda/benchmarks/local_cudf_shuffle.py | 2 +- dask_cuda/explicit_comms/dataframe/shuffle.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index d9039aad..6497fb7c 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -45,7 +45,7 @@ def shuffle_explicit_comms(df, args): t1 = perf_counter() wait( dask_cuda.explicit_comms.dataframe.shuffle.shuffle( - df, column_names="data", ignore_index=args.ignore_index + df, column_names=["data"], ignore_index=args.ignore_index ).persist() ) return perf_counter() - t1 diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index c6e07006..46c4bccb 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -174,10 +174,18 @@ def partition_dataframe( Returns ------- - partitions: list of DataFrames - List of dataframe-partitions + partitions + Dict of dataframe-partitions, mapping partition-ID to dataframe """ - # TODO: use cuDF's partition_by_hash() when `column_names[0] != "_partitions"` + if column_names[0] != "_partitions" and hasattr(df, "partition_by_hash"): + return dict( + zip( + range(npartitions), + df.partition_by_hash( + column_names, npartitions, keep_index=not ignore_index + ), + ) + ) map_index = compute_map_index(df, column_names, npartitions) return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) From 66a6a46ad7f7bfc030e1882321549af25110d02c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 23 Jan 2023 13:25:07 +0100 Subject: [PATCH 22/31] shuffle-benchmark: add `--partition-distribution` (#1081) Implements a `--partition-distribution` argument to `local_cudf_shuffle.py` Authors: - Mads R. B. Kristensen (https://github.com/madsbk) - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1081 --- dask_cuda/benchmarks/local_cudf_shuffle.py | 44 +++++++++++++++++----- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 6497fb7c..51ba48f9 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -70,20 +70,37 @@ def create_data( The partitions are perfectly distributed across workers, if the number of requested partitions is evenly divisible by the number of workers. """ + chunksize = args.partition_size // np.float64().nbytes workers = list(client.scheduler_info()["workers"].keys()) assert len(workers) > 0 - chunksize = args.partition_size // np.float64().nbytes - # Distribute the new partitions between workers by round robin. - # We use `client.submit` to control the distribution exactly. 
- # TODO: support unbalanced partition distribution - dsk = {} - for i in range(args.in_parts): - worker = workers[i % len(workers)] # Round robin - dsk[(name, i)] = client.submit( - create_df, chunksize, args.type, workers=[worker], pure=False + dist = args.partition_distribution + if dist is None: + # By default, we create a balanced distribution + dist = [args.in_parts // len(workers)] * len(workers) + for i in range(args.in_parts % len(workers)): + dist[i] += 1 + + if len(dist) != len(workers): + raise ValueError( + f"The length of `--devs`({len(dist)}) and " + f"`--partition-distribution`({len(workers)}) doesn't match" ) + if sum(dist) != args.in_parts: + raise ValueError( + f"The sum of `--partition-distribution`({sum(dist)}) must match " + f"the number of input partitions `--in-parts={args.in_parts}`" + ) + + # Create partition based to the specified partition distribution + dsk = {} + for i, part_size in enumerate(dist): + for _ in range(part_size): + # We use `client.submit` to control placement of the partition. + dsk[(name, len(dsk))] = client.submit( + create_df, chunksize, args.type, workers=[workers[i]], pure=False + ) wait(dsk.values()) df_meta = create_df(0, args.type) @@ -225,6 +242,15 @@ def parse_args(): "action": "store_true", "help": "When shuffle, ignore the index", }, + { + "name": "--partition-distribution", + "default": None, + "metavar": "PARTITION_SIZE_LIST", + "type": lambda x: [int(y) for y in x.split(",")], + "help": "Comma separated list defining the size of each partition, " + "which must have the same length as `--devs`. " + "If not set, a balanced distribution is used.", + }, ] return parse_benchmark_args( From 9ff39962d0f06f7650d899debff6c45cfa95bf99 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 24 Jan 2023 23:41:53 -0800 Subject: [PATCH 23/31] Fix whitespace & add URLs in `pyproject.toml` (#1092) Authors: - https://github.com/jakirkham Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1092 --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b4b5633..7163e4f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license= { text = "Apache-2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ "dask >=2022.12.0", @@ -27,7 +27,7 @@ dependencies = [ "pandas >=1.0", "zict >=0.1.3", ] -classifiers=[ +classifiers = [ "Intended Audience :: Developers", "Topic :: Database", "Topic :: Scientific/Engineering", @@ -58,6 +58,8 @@ test = [ [project.urls] Homepage = "https://github.com/rapidsai/dask-cuda" +Documentation = "https://docs.rapids.ai/api/dask-cuda/stable/" +Source = "https://github.com/rapidsai/dask-cuda" [tool.coverage.run] disable_warnings = [ From 4f0922cb3d9adda4f185beb03e868b54b9e0293a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 27 Jan 2023 23:38:47 +0100 Subject: [PATCH 24/31] Update `cudf.Buffer` pointer access method (#1094) Fix test that reads directly from `cudf.Buffer` pointer to new `get_ptr(mode="read")`, in accordance with changes from https://github.com/rapidsai/cudf/pull/12587 . 
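A minimal sketch of the access-pattern change (illustrative only, not part of the diff below; the DataFrame and variable names here are hypothetical), assuming a cuDF build that provides `Buffer.get_ptr()`:

```python
import cudf

df = cudf.DataFrame({"a": range(10)})
buf = df["a"].data  # cudf.Buffer backing the column's device memory

# Previously the raw device pointer was read as an attribute:
#     ptr = buf.ptr
# After rapidsai/cudf#12587 the pointer is requested explicitly, here read-only:
ptr = buf.get_ptr(mode="read")
```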
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1094 --- dask_cuda/tests/test_proxify_host_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 0b0f9d5b..41399d67 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -281,7 +281,7 @@ def test_dataframes_share_dev_mem(root_dir): # Even though the two dataframe doesn't point to the same cudf.Buffer object assert view1["a"].data is not view2["a"].data # They still share the same underlying device memory - view1["a"].data.ptr == view2["a"].data.ptr + view1["a"].data.get_ptr(mode="read") == view2["a"].data.get_ptr(mode="read") dhf = ProxifyHostFile( worker_local_directory=root_dir, device_memory_limit=160, memory_limit=1000 From 43969d72237479ce3b7c68d5d262ac339dc525cf Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 30 Jan 2023 17:20:59 +0100 Subject: [PATCH 25/31] pre-commit: spell, whitespace, and mypy check (#1091) close https://github.com/rapidsai/dask-cuda/issues/1077 Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1091 --- .pre-commit-config.yaml | 23 +++++++++++++++++++ .readthedocs.yml | 2 +- dask_cuda/benchmarks/utils.py | 2 +- dask_cuda/cli.py | 2 +- dask_cuda/disk_io.py | 4 ++-- dask_cuda/explicit_comms/dataframe/shuffle.py | 6 ++--- dask_cuda/initialize.py | 2 +- dask_cuda/is_spillable_object.py | 2 +- dask_cuda/proxify_device_objects.py | 10 ++++---- dask_cuda/proxify_host_file.py | 14 +++++------ dask_cuda/proxy_object.py | 10 ++++---- dask_cuda/tests/test_cudf_builtin_spilling.py | 2 +- dask_cuda/utils.py | 2 +- docs/Makefile | 2 +- docs/source/api.rst | 1 - docs/source/examples/best-practices.rst | 1 - docs/source/ucx.rst | 3 +-- rtd/Makefile | 2 +- 18 files changed, 57 insertions(+), 33 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bd219066..cc597578 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,9 @@ repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: @@ -11,5 +16,23 @@ repos: rev: 3.8.3 hooks: - id: flake8 + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + exclude: | + (?x)^( + .*test.*| + ^CHANGELOG.md$| + ^.*versioneer.py$ + ) + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v0.991' + hooks: + - id: mypy + additional_dependencies: [types-cachetools] + args: ["--module=dask_cuda", "--ignore-missing-imports"] + pass_filenames: false + default_language_version: python: python3 diff --git a/.readthedocs.yml b/.readthedocs.yml index 0b2ac73c..fd5ccf68 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,4 +4,4 @@ sphinx: configuration: rtd/conf.py formats: - - htmlzip \ No newline at end of file + - htmlzip diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 28d43cc1..1d07df30 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -648,7 +648,7 @@ def bandwidth_statistics( logs: the ``dask_worker.incoming_transfer_log`` 
object ignore_size: int (optional) - ignore messsages whose total byte count is smaller than this + ignore messages whose total byte count is smaller than this value (if provided) Returns diff --git a/dask_cuda/cli.py b/dask_cuda/cli.py index e2690f15..b7069d63 100644 --- a/dask_cuda/cli.py +++ b/dask_cuda/cli.py @@ -137,7 +137,7 @@ def cuda(): "--rmm-async/--no-rmm-async", default=False, show_default=True, - help="""Initialize each worker withh RMM and set it to use RMM's asynchronous + help="""Initialize each worker with RMM and set it to use RMM's asynchronous allocator. See ``rmm.mr.CudaAsyncMemoryResource`` for more info. .. warning:: diff --git a/dask_cuda/disk_io.py b/dask_cuda/disk_io.py index 7ccda0f3..0427b77f 100644 --- a/dask_cuda/disk_io.py +++ b/dask_cuda/disk_io.py @@ -96,8 +96,8 @@ class SpillToDiskProperties: def __init__( self, root_dir: Union[str, os.PathLike], - shared_filesystem: bool = None, - gds: bool = None, + shared_filesystem: Optional[bool] = None, + gds: Optional[bool] = None, ): """ Parameters diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 46c4bccb..84bc5570 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -270,7 +270,7 @@ async def send_recv_partitions( myrank The rank of this worker. rank_to_out_part_ids - dict that for each worker rank specifices a set of output partition IDs. + dict that for each worker rank specifies a set of output partition IDs. If the worker shouldn't return any partitions, it is excluded from the dict. Partition IDs are global integers `0..npartitions` and corresponds to the dict keys returned by `group_split_dispatch`. @@ -332,9 +332,9 @@ async def shuffle_task( stage_name: str Name of the stage to retrieve the input keys from. rank_to_inkeys: dict - dict that for each worker rank specifices the set of staged input keys. + dict that for each worker rank specifies the set of staged input keys. rank_to_out_part_ids: dict - dict that for each worker rank specifices a set of output partition IDs. + dict that for each worker rank specifies a set of output partition IDs. If the worker shouldn't return any partitions, it is excluded from the dict. Partition IDs are global integers `0..npartitions` and corresponds to the dict keys returned by `group_split_dispatch`. diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index 52a67e31..0b9c92a5 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -30,7 +30,7 @@ def _create_cuda_context(): try: distributed.comm.ucx.init_once() except ModuleNotFoundError: - # UCX intialization has to be delegated to Distributed, it will take care + # UCX initialization has to be delegated to Distributed, it will take care # of setting correct environment variables and importing `ucp` after that. # Therefore if ``import ucp`` fails we can just continue here. 
pass diff --git a/dask_cuda/is_spillable_object.py b/dask_cuda/is_spillable_object.py index 9e337aa8..cb85248e 100644 --- a/dask_cuda/is_spillable_object.py +++ b/dask_cuda/is_spillable_object.py @@ -40,7 +40,7 @@ def is_device_object_cudf_index(s): def cudf_spilling_status() -> Optional[bool]: - """Check the status of cudf's build-in spilling + """Check the status of cudf's built-in spilling Returns: - True if cudf's internal spilling is enabled, or diff --git a/dask_cuda/proxify_device_objects.py b/dask_cuda/proxify_device_objects.py index 923e7cf8..a8b8a45d 100644 --- a/dask_cuda/proxify_device_objects.py +++ b/dask_cuda/proxify_device_objects.py @@ -19,7 +19,7 @@ def _register_incompatible_types(): """Lazy register types that ProxifyHostFile should unproxify on retrieval. It reads the config key "jit-unspill-incompatible" - (DASK_JIT_UNSPILL_INCOMPATIBLE), which should be a comma seperated + (DASK_JIT_UNSPILL_INCOMPATIBLE), which should be a comma separated list of types. The default value is: DASK_JIT_UNSPILL_INCOMPATIBLE="cupy.ndarray" """ @@ -51,8 +51,8 @@ def f(paths): def proxify_device_objects( obj: T, - proxied_id_to_proxy: MutableMapping[int, ProxyObject] = None, - found_proxies: List[ProxyObject] = None, + proxied_id_to_proxy: Optional[MutableMapping[int, ProxyObject]] = None, + found_proxies: Optional[List[ProxyObject]] = None, excl_proxies: bool = False, mark_as_explicit_proxies: bool = False, ) -> T: @@ -135,7 +135,9 @@ def unproxify_device_objects( pxy = obj._pxy_get(copy=True) if only_incompatible_types: if incompatible_types and isinstance(obj, incompatible_types): - obj = obj._pxy_deserialize(maybe_evict=False, proxy_detail=pxy) + obj = obj._pxy_deserialize( # type: ignore + maybe_evict=False, proxy_detail=pxy + ) elif not skip_explicit_proxies or not pxy.explicit_proxy: pxy.explicit_proxy = False obj = obj._pxy_deserialize(maybe_evict=False, proxy_detail=pxy) diff --git a/dask_cuda/proxify_host_file.py b/dask_cuda/proxify_host_file.py index 47bb3952..724a08ba 100644 --- a/dask_cuda/proxify_host_file.py +++ b/dask_cuda/proxify_host_file.py @@ -164,7 +164,7 @@ class ProxiesOnDevice(Proxies): In this case the tally of the total device memory usage is incorrect. """ - def __init__(self): + def __init__(self) -> None: super().__init__() self.proxy_id_to_dev_mems: Dict[int, Set[DeviceMemoryId]] = {} self.dev_mem_to_proxy_ids: DefaultDict[DeviceMemoryId, Set[int]] = defaultdict( @@ -477,7 +477,7 @@ class ProxifyHostFile(MutableMapping): spill_on_demand: bool or None, default None Enables spilling when the RMM memory pool goes out of memory. If ``None``, the "spill-on-demand" config value are used, which defaults to True. - Notice, enabling this does nothing when RMM isn't availabe or not used. + Notice, enabling this does nothing when RMM isn't available or not used. gds_spilling: bool Enable GPUDirect Storage spilling. If ``None``, the "gds-spilling" config value are used, which defaults to ``False``. 
@@ -497,10 +497,10 @@ def __init__( *, device_memory_limit: int, memory_limit: int, - shared_filesystem: bool = None, - compatibility_mode: bool = None, - spill_on_demand: bool = None, - gds_spilling: bool = None, + shared_filesystem: Optional[bool] = None, + compatibility_mode: Optional[bool] = None, + spill_on_demand: Optional[bool] = None, + gds_spilling: Optional[bool] = None, ): if cudf_spilling_status(): warnings.warn( @@ -635,7 +635,7 @@ def evict(self) -> int: def fast(self): """Alternative access to `.evict()` used by Dask - Dask expects `.fast.evict()` to be availabe for manually triggering + Dask expects `.fast.evict()` to be available for manually triggering of CPU-to-Disk spilling. """ if len(self.manager._host) == 0: diff --git a/dask_cuda/proxy_object.py b/dask_cuda/proxy_object.py index 80aaa7c4..21dc15ea 100644 --- a/dask_cuda/proxy_object.py +++ b/dask_cuda/proxy_object.py @@ -46,7 +46,9 @@ def asproxy( - obj: object, serializers: Iterable[str] = None, subclass: Type["ProxyObject"] = None + obj: object, + serializers: Optional[Iterable[str]] = None, + subclass: Optional[Type["ProxyObject"]] = None, ) -> "ProxyObject": """Wrap `obj` in a ProxyObject object if it isn't already. @@ -344,7 +346,7 @@ class ProxyObject: Attributes ---------- _pxy: ProxyDetail - Details of all proxy information of the underlaying proxied object. + Details of all proxy information of the underlying proxied object. Access to _pxy is not pass-through to the proxied object, which is the case for most other access to the ProxyObject. @@ -380,7 +382,7 @@ def __del__(self): def _pxy_serialize( self, serializers: Iterable[str], - proxy_detail: ProxyDetail = None, + proxy_detail: Optional[ProxyDetail] = None, ) -> None: """Inplace serialization of the proxied object using the `serializers` @@ -410,7 +412,7 @@ def _pxy_serialize( self._pxy_cache.pop("device_memory_objects", None) def _pxy_deserialize( - self, maybe_evict: bool = True, proxy_detail: ProxyDetail = None + self, maybe_evict: bool = True, proxy_detail: Optional[ProxyDetail] = None ): """Inplace deserialization of the proxied object diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py index c6548e42..d4c28ba0 100644 --- a/dask_cuda/tests/test_cudf_builtin_spilling.py +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -34,7 +34,7 @@ @pytest.fixture def manager(request): - """Fixture to enable and make a spilling manager availabe""" + """Fixture to enable and make a spilling manager available""" kwargs = dict(getattr(request, "param", {})) set_global_manager(manager=SpillManager(**kwargs)) yield get_global_manager() diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 850006ea..1a24d80b 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -682,7 +682,7 @@ def get_gpu_uuid_from_index(device_index=0): def get_worker_config(dask_worker): from .proxify_host_file import ProxifyHostFile - # assume homogenous cluster + # assume homogeneous cluster plugin_vals = dask_worker.plugins.values() ret = {} diff --git a/docs/Makefile b/docs/Makefile index 69fe55ec..ba501f6f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/api.rst b/docs/source/api.rst index 7989fa5e..b9d9d6df 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,4 +33,3 @@ Explicit-comms .. currentmodule:: dask_cuda.explicit_comms.comms .. autoclass:: CommsContext :members: - diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst index 242e90ff..84cc78b8 100644 --- a/docs/source/examples/best-practices.rst +++ b/docs/source/examples/best-practices.rst @@ -114,4 +114,3 @@ With UCX and NVLink, we greatly reduced the wall clock time to: ``347.43 ms +/- 0 | ucx://127.0.0.1:35954 1 | ucx://127.0.0.1:53584 ================================================================================ - diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index 7463f0c1..d9cacdc7 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -127,8 +127,7 @@ therefore do something like the following: .. note:: - To confirm that no bad fork calls are occuring, start jobs with + To confirm that no bad fork calls are occurring, start jobs with ``UCX_IB_FORK_INIT=n``. UCX will produce a warning ``UCX WARN IB: ibv_fork_init() was disabled or failed, yet a fork() has been issued.`` if the application calls ``fork()``. - diff --git a/rtd/Makefile b/rtd/Makefile index 69fe55ec..ba501f6f 100644 --- a/rtd/Makefile +++ b/rtd/Makefile @@ -16,4 +16,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) From 7a67a3d27c06994cc8db845d2809c8fd885b7e44 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 30 Jan 2023 18:18:42 +0000 Subject: [PATCH 26/31] pre-commit: Update isort version to 5.12.0 (#1098) poetry version 1.5.0 broke installs of isort prior to 5.11.5 (see pycqa/isort#2077 and pycqa/isort#2078), so we need to upgrade. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1098 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc597578..030c454b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/ambv/black From 0628f055bbd50fbd40498e841cddd5cec4187ec6 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 31 Jan 2023 15:06:19 +0100 Subject: [PATCH 27/31] explicit-comms: don't mix `-` and `_` in config (#1096) Using `dask.config.get("explicit_comms-batchsize", 1)` doesn't read `DASK_EXPLICIT_COMMS_BATCHSIZE` correctly. Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1096 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 7 ++- dask_cuda/tests/test_explicit_comms.py | 46 +++++++++++-------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 84bc5570..d79b08a4 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -477,9 +477,14 @@ def shuffle( # Get batchsize max_num_inkeys = max(len(k) for k in rank_to_inkeys.values()) - batchsize = batchsize or dask.config.get("explicit_comms-batchsize", 1) + batchsize = batchsize or dask.config.get("explicit-comms-batchsize", 1) if batchsize == -1: batchsize = max_num_inkeys + if not isinstance(batchsize, int) or batchsize < 0: + raise ValueError( + "explicit-comms-batchsize must be a " + f"positive integer or -1 (was '{batchsize}')" + ) # Get number of rounds of dataframe partitioning and all-to-all communication. num_rounds = ceil(max_num_inkeys / batchsize) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 88e1294c..413bf5bd 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -1,5 +1,7 @@ import asyncio import multiprocessing as mp +import os +from unittest.mock import patch import numpy as np import pandas as pd @@ -172,8 +174,9 @@ def test_dataframe_shuffle(backend, protocol, nworkers): assert not p.exitcode -def _test_dask_use_explicit_comms(): - def check_shuffle(in_cluster): +@pytest.mark.parametrize("in_cluster", [True, False]) +def test_dask_use_explicit_comms(in_cluster): + def check_shuffle(): """Check if shuffle use explicit-comms by search for keys named 'explicit-comms-shuffle' """ @@ -189,23 +192,28 @@ def check_shuffle(in_cluster): else: # If not in cluster, we cannot use explicit comms assert all(name not in str(key) for key in res.dask) - with LocalCluster( - protocol="tcp", - dashboard_address=None, - n_workers=2, - threads_per_worker=1, - processes=True, - ) as cluster: - with Client(cluster): - check_shuffle(True) - check_shuffle(False) - - -def test_dask_use_explicit_comms(): - p = mp.Process(target=_test_dask_use_explicit_comms) - p.start() - p.join() - assert not p.exitcode + if in_cluster: + # We check environment variables by setting an illegal batchsize + with patch.dict( + os.environ, + {"DASK_EXPLICIT_COMMS": "1", "DASK_EXPLICIT_COMMS_BATCHSIZE": "-2"}, + ): + dask.config.refresh() # Trigger re-read of the environment variables + with pytest.raises(ValueError, match="explicit-comms-batchsize"): + ddf.shuffle(on="key", npartitions=4, shuffle="tasks") + + if in_cluster: + with LocalCluster( + protocol="tcp", + dashboard_address=None, + n_workers=2, + threads_per_worker=1, + processes=True, + ) as cluster: + with Client(cluster): + check_shuffle() + else: + check_shuffle() def _test_dataframe_shuffle_merge(backend, protocol, n_workers): From 84f4aa2e73b28aef0139cb88a83459e3def08b4b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 1 Feb 2023 09:09:30 +0100 Subject: [PATCH 28/31] Proxify: make duplicate check optional (#1101) In order to improve performance, it is now possible to skip the duplication check in `ProxyManager.proxify()`. We use this in explicit-comms shuffle. Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1101 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 27 ++++++++------ dask_cuda/proxify_host_file.py | 37 ++++++++++++++++--- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index d79b08a4..4b240d2f 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -30,13 +30,14 @@ def get_proxify(worker: Worker) -> Proxify: from dask_cuda.proxify_host_file import ProxifyHostFile if isinstance(worker.data, ProxifyHostFile): - data = worker.data - return lambda x: data.manager.proxify(x)[0] + # Notice, we know that we never call proxify() on the same proxied + # object thus we can speedup the call by setting `duplicate_check=False` + return lambda x: worker.data.manager.proxify(x, duplicate_check=False)[0] return lambda x: x # no-op def get_no_comm_postprocess( - stage: Dict[str, Any], num_rounds: int, batchsize: int + stage: Dict[str, Any], num_rounds: int, batchsize: int, proxify: Proxify ) -> Callable[[DataFrame], DataFrame]: """Get function for post-processing partitions not communicated @@ -52,10 +53,12 @@ def get_no_comm_postprocess( ---------- stage The staged input dataframes. - num_rounds: int + num_rounds Number of rounds of dataframe partitioning and all-to-all communication. - batchsize: int + batchsize Number of partitions each worker will handle in each round. + proxify + Function to proxify object. Returns ------- @@ -75,9 +78,11 @@ def get_no_comm_postprocess( # Deep copying a cuDF dataframe doesn't deep copy its index hence # we have to do it explicitly. - return lambda x: x._from_data( - x._data.copy(deep=True), - x._index.copy(deep=True), + return lambda x: proxify( + x._from_data( + x._data.copy(deep=True), + x._index.copy(deep=True), + ) ) @@ -246,7 +251,7 @@ def create_partitions( t = [df_grouped[i] for df_grouped in dfs_grouped] assert len(t) > 0 if len(t) == 1: - ret[i] = proxify(t[0]) + ret[i] = t[0] elif len(t) > 1: ret[i] = proxify(dd_concat(t, ignore_index=ignore_index)) return ret @@ -305,7 +310,7 @@ async def send_recv_partitions( # We can now add them to the output dataframes. 
for out_part_id, dataframe in out_part_id_to_dataframe.items(): out_part_id_to_dataframe_list[out_part_id].append( - no_comm_postprocess(proxify(dataframe)) + no_comm_postprocess(dataframe) ) out_part_id_to_dataframe.clear() @@ -361,7 +366,7 @@ async def shuffle_task( myrank: int = s["rank"] stage = comms.pop_staging_area(s, stage_name) assert stage.keys() == rank_to_inkeys[myrank] - no_comm_postprocess = get_no_comm_postprocess(stage, num_rounds, batchsize) + no_comm_postprocess = get_no_comm_postprocess(stage, num_rounds, batchsize, proxify) out_part_id_to_dataframe_list: Dict[int, List[DataFrame]] = defaultdict(list) for _ in range(num_rounds): diff --git a/dask_cuda/proxify_host_file.py b/dask_cuda/proxify_host_file.py index 724a08ba..04716a2b 100644 --- a/dask_cuda/proxify_host_file.py +++ b/dask_cuda/proxify_host_file.py @@ -322,20 +322,45 @@ def validate(self): header, _ = pxy.obj assert header["serializer"] == pxy.serializer - def proxify(self, obj: T) -> Tuple[T, bool]: + def proxify(self, obj: T, duplicate_check=True) -> Tuple[T, bool]: """Proxify `obj` and add found proxies to the `Proxies` collections + Search through `obj` and wrap all CUDA device objects in ProxyObject. + If duplicate_check is True, identical CUDA device objects found in + `obj` are wrapped by the same ProxyObject. + Returns the proxified object and a boolean, which is `True` when one or more incompatible-types were found. + + Parameters + ---------- + obj + Object to search through or wrap in a ProxyObject. + duplicate_check + Make sure that identical CUDA device objects found in `obj` are + wrapped by the same ProxyObject. This check comes with a significant + overhead hence it is recommended setting to False when it is known + that no duplicate exist. + + Return + ------ + obj + The proxified object. + bool + Whether incompatible-types were found or not. """ + incompatible_type_found = False with self.lock: found_proxies: List[ProxyObject] = [] - # In order detect already proxied object, proxify_device_objects() - # needs a mapping from proxied objects to their proxy objects. - proxied_id_to_proxy = { - id(p._pxy_get().obj): p for p in self._dev.get_proxies() - } + if duplicate_check: + # In order to detect already proxied object, proxify_device_objects() + # needs a mapping from proxied objects to their proxy objects. + proxied_id_to_proxy = { + id(p._pxy_get().obj): p for p in self._dev.get_proxies() + } + else: + proxied_id_to_proxy = None ret = proxify_device_objects(obj, proxied_id_to_proxy, found_proxies) last_access = time.monotonic() for p in found_proxies: From 7298f1e4601e344d033e5dbfaccfe03dfca7e83e Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Fri, 3 Feb 2023 17:20:58 -0500 Subject: [PATCH 29/31] update workflow branches [skip ci] (#1105) This PR updates the branch reference used for our shared workflows. I will open similar PRs for `branch-23.04` next week. 
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) --- .github/workflows/build.yaml | 6 +++--- .github/workflows/pr.yaml | 10 +++++----- .github/workflows/test.yaml | 2 +- ci/release/update-version.sh | 4 ++++ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d36d0e81..bce48ebd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: conda-python-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: if: ${{ startsWith(github.ref, 'refs/heads/branch-') }} needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: branch node_type: "gpu-latest-1" @@ -48,7 +48,7 @@ jobs: upload-conda: needs: [conda-python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 730b3587..3dee7d77 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -18,26 +18,26 @@ jobs: - docs-build - wheel-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.02 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.02 conda-python-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.02 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.02 with: build_type: pull-request node_type: "gpu-latest-1" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 33d6c020..5c18a0b1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-python-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-118 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.02 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 
41658e73..cab06b0a 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -39,3 +39,7 @@ sed_runner "s/cudf=.*/cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml + +for FILE in .github/workflows/*.yaml; do + sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" +done From 80d72969ac5156f7e34dcaa38c074cfd77095536 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 6 Feb 2023 02:12:07 -0600 Subject: [PATCH 30/31] Pin `dask` and `distributed` for release (#1106) This PR pins `dask` and `distributed` to `2023.1.1` for `23.02` release. xref: https://github.com/rapidsai/cudf/pull/12695 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Mark Sadang (https://github.com/msadang) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1106 --- ci/cpu/build.sh | 4 ++-- ci/gpu/build.sh | 4 ++-- dependencies.yaml | 4 ++-- pyproject.toml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 6b91ca9e..b1b27964 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -21,10 +21,10 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, # `dask/label/dev` channel is removed. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.12.0" +export DASK_STABLE_VERSION="2023.1.1" # Switch to project root; also root of repo checkout cd "$WORKSPACE" diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b9661f52..2d6f35f1 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,10 +35,10 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 # Install dask and distributed from main branch. Usually needed during # development time and disabled before a new dask-cuda release. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.12.0" +export DASK_STABLE_VERSION="2023.1.1" # Temporary workaround for Jupyter errors. 
# See https://github.com/rapidsai/dask-cuda/issues/1040 diff --git a/dependencies.yaml b/dependencies.yaml index 2d673971..9b471e6a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -95,8 +95,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - dask>=2022.12.0 - - distributed>=2022.12.0 + - dask==2023.1.1 + - distributed==2023.1.1 - numba>=0.54 - numpy>=1.18.0 - pandas>=1.0 diff --git a/pyproject.toml b/pyproject.toml index 7163e4f6..58f156bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ - "dask >=2022.12.0", - "distributed >=2022.12.0", + "dask ==2023.1.1", + "distributed ==2023.1.1", "pynvml >=11.0.0", "numpy >=1.18.0", "numba >=0.54", From e2db7c9112474a1de7dda9624c710721d1dcd3ca Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 9 Feb 2023 10:08:35 -0500 Subject: [PATCH 31/31] update changelog --- CHANGELOG.md | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 819da818..f82b7e59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,43 @@ -# dask-cuda 23.02.00 (Date TBD) +# dask-cuda 23.02.00 (9 Feb 2023) -Please see https://github.com/rapidsai/dask-cuda/releases/tag/v23.02.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Pin `dask` and `distributed` for release ([#1106](https://github.com/rapidsai/dask-cuda/pull/1106)) [@galipremsagar](https://github.com/galipremsagar) + +## 🐛 Bug Fixes + +- pre-commit: Update isort version to 5.12.0 ([#1098](https://github.com/rapidsai/dask-cuda/pull/1098)) [@wence-](https://github.com/wence-) +- explicit-comms: don't mix `-` and `_` in config ([#1096](https://github.com/rapidsai/dask-cuda/pull/1096)) [@madsbk](https://github.com/madsbk) +- Update `cudf.Buffer` pointer access method ([#1094](https://github.com/rapidsai/dask-cuda/pull/1094)) [@pentschev](https://github.com/pentschev) +- Update tests for Python 3.10 ([#1086](https://github.com/rapidsai/dask-cuda/pull/1086)) [@pentschev](https://github.com/pentschev) +- Use `pkgutil.iter_modules` to get un-imported module for `test_pre_import` ([#1085](https://github.com/rapidsai/dask-cuda/pull/1085)) [@charlesbluca](https://github.com/charlesbluca) +- Make proxy tests with `LocalCUDACluster` asynchronous ([#1084](https://github.com/rapidsai/dask-cuda/pull/1084)) [@pentschev](https://github.com/pentschev) +- Ensure consistent results from `safe_sizeof()` in test ([#1071](https://github.com/rapidsai/dask-cuda/pull/1071)) [@madsbk](https://github.com/madsbk) +- Pass missing argument to groupby benchmark compute ([#1069](https://github.com/rapidsai/dask-cuda/pull/1069)) [@mattf](https://github.com/mattf) +- Reorder channel priority. 
([#1067](https://github.com/rapidsai/dask-cuda/pull/1067)) [@bdice](https://github.com/bdice) +- Fix owner check when the owner is a cupy array ([#1061](https://github.com/rapidsai/dask-cuda/pull/1061)) [@wence-](https://github.com/wence-) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1106](https://github.com/rapidsai/dask-cuda/pull/1106)) [@galipremsagar](https://github.com/galipremsagar) +- Update shared workflow branches ([#1105](https://github.com/rapidsai/dask-cuda/pull/1105)) [@ajschmidt8](https://github.com/ajschmidt8) +- Proxify: make duplicate check optional ([#1101](https://github.com/rapidsai/dask-cuda/pull/1101)) [@madsbk](https://github.com/madsbk) +- Fix whitespace & add URLs in `pyproject.toml` ([#1092](https://github.com/rapidsai/dask-cuda/pull/1092)) [@jakirkham](https://github.com/jakirkham) +- pre-commit: spell, whitespace, and mypy check ([#1091](https://github.com/rapidsai/dask-cuda/pull/1091)) [@madsbk](https://github.com/madsbk) +- shuffle: use cuDF's `partition_by_hash()` when available ([#1090](https://github.com/rapidsai/dask-cuda/pull/1090)) [@madsbk](https://github.com/madsbk) +- add initial docs build ([#1089](https://github.com/rapidsai/dask-cuda/pull/1089)) [@AjayThorve](https://github.com/AjayThorve) +- Remove `--get-cluster-configuration` option, check for scheduler in `dask cuda config` ([#1088](https://github.com/rapidsai/dask-cuda/pull/1088)) [@charlesbluca](https://github.com/charlesbluca) +- Add timeout to `pytest` command ([#1082](https://github.com/rapidsai/dask-cuda/pull/1082)) [@ajschmidt8](https://github.com/ajschmidt8) +- shuffle-benchmark: add `--partition-distribution` ([#1081](https://github.com/rapidsai/dask-cuda/pull/1081)) [@madsbk](https://github.com/madsbk) +- Ensure tests run for Python `3.10` ([#1080](https://github.com/rapidsai/dask-cuda/pull/1080)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use TrackingResourceAdaptor to get better debug info ([#1079](https://github.com/rapidsai/dask-cuda/pull/1079)) [@madsbk](https://github.com/madsbk) +- Improve shuffle-benchmark ([#1074](https://github.com/rapidsai/dask-cuda/pull/1074)) [@madsbk](https://github.com/madsbk) +- Update builds for CUDA `11.8` and Python `310` ([#1072](https://github.com/rapidsai/dask-cuda/pull/1072)) [@ajschmidt8](https://github.com/ajschmidt8) +- Shuffle by partition to reduce memory usage significantly ([#1068](https://github.com/rapidsai/dask-cuda/pull/1068)) [@madsbk](https://github.com/madsbk) +- Enable copy_prs. ([#1063](https://github.com/rapidsai/dask-cuda/pull/1063)) [@bdice](https://github.com/bdice) +- Add GitHub Actions Workflows ([#1062](https://github.com/rapidsai/dask-cuda/pull/1062)) [@bdice](https://github.com/bdice) +- Unpin `dask` and `distributed` for development ([#1060](https://github.com/rapidsai/dask-cuda/pull/1060)) [@galipremsagar](https://github.com/galipremsagar) +- Switch to the new dask CLI ([#981](https://github.com/rapidsai/dask-cuda/pull/981)) [@jacobtomlinson](https://github.com/jacobtomlinson) # dask-cuda 22.12.00 (8 Dec 2022)