Merge pull request #1188 from rapidsai/branch-23.06
[RELEASE] dask-cuda v23.06
raydouglass committed Jun 7, 2023
2 parents c55bb7f + af05c73 commit ec3186d
Showing 19 changed files with 108 additions and 73 deletions.
19 changes: 12 additions & 7 deletions .github/workflows/build.yaml
@@ -28,27 +28,30 @@ concurrency:
jobs:
conda-python-build:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
docs-build:
if: github.ref_type == 'branch' && github.event_name == 'push'
if: github.ref_type == 'branch'
needs: [conda-python-build]
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
with:
build_type: branch
node_type: "gpu-latest-1"
arch: "amd64"
branch: ${{ inputs.branch }}
build_type: ${{ inputs.build_type || 'branch' }}
container_image: "rapidsai/ci:latest"
date: ${{ inputs.date }}
node_type: "gpu-v100-latest-1"
run_script: "ci/build_docs.sh"
sha: ${{ inputs.sha }}
upload-conda:
needs: [conda-python-build]
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06
with:
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
@@ -67,8 +70,10 @@ jobs:
fetch-depth: 0
- name: Build wheel
run: ci/build_python_pypi.sh
env:
GH_TOKEN: ${{ github.token }}
- name: Publish distribution 📦 to PyPI
if: inputs.build_type == 'nightly'
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.RAPIDSAI_PYPI_TOKEN }}
skip-existing: true
14 changes: 8 additions & 6 deletions .github/workflows/pr.yaml
@@ -18,29 +18,29 @@ jobs:
- docs-build
- wheel-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06
checks:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
conda-python-build:
needs: checks
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
with:
build_type: pull-request
conda-python-tests:
needs: conda-python-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
with:
build_type: pull-request
docs-build:
needs: conda-python-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
with:
build_type: pull-request
node_type: "gpu-latest-1"
node_type: "gpu-v100-latest-1"
arch: "amd64"
container_image: "rapidsai/ci:latest"
run_script: "ci/build_docs.sh"
@@ -58,3 +58,5 @@ jobs:
fetch-depth: 0
- name: Build wheel
run: ci/build_python_pypi.sh
env:
GH_TOKEN: ${{ github.token }}
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
jobs:
conda-python-tests:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
with:
build_type: nightly
branch: ${{ inputs.branch }}
36 changes: 36 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,39 @@
# dask-cuda 23.06.00 (7 Jun 2023)

## 🚨 Breaking Changes

- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina)

## 🐛 Bug Fixes

- Increase pytest CI timeout ([#1196](https://github.com/rapidsai/dask-cuda/pull/1196)) [@pentschev](https://github.com/pentschev)
- Increase minimum timeout to wait for workers in CI ([#1193](https://github.com/rapidsai/dask-cuda/pull/1193)) [@pentschev](https://github.com/pentschev)
- Disable `np.bool` deprecation warning ([#1182](https://github.com/rapidsai/dask-cuda/pull/1182)) [@pentschev](https://github.com/pentschev)
- Always upload on branch/nightly builds ([#1177](https://github.com/rapidsai/dask-cuda/pull/1177)) [@raydouglass](https://github.com/raydouglass)
- Workaround for `DeviceHostFile` tests with CuPy>=12.0.0 ([#1175](https://github.com/rapidsai/dask-cuda/pull/1175)) [@pentschev](https://github.com/pentschev)
- Temporarily relax Python constraint ([#1166](https://github.com/rapidsai/dask-cuda/pull/1166)) [@vyasr](https://github.com/vyasr)

## 📖 Documentation

- [doc] Add document about main guard. ([#1157](https://github.com/rapidsai/dask-cuda/pull/1157)) [@trivialfis](https://github.com/trivialfis)

## 🚀 New Features

- Require Numba 0.57.0+ ([#1185](https://github.com/rapidsai/dask-cuda/pull/1185)) [@jakirkham](https://github.com/jakirkham)
- Revert "Temporarily relax Python constraint" ([#1171](https://github.com/rapidsai/dask-cuda/pull/1171)) [@vyasr](https://github.com/vyasr)
- Update to zict 3.0 ([#1160](https://github.com/rapidsai/dask-cuda/pull/1160)) [@pentschev](https://github.com/pentschev)

## 🛠️ Improvements

- Add `__main__` entrypoint to dask-cuda-worker CLI ([#1181](https://github.com/rapidsai/dask-cuda/pull/1181)) [@hmacdope](https://github.com/hmacdope)
- run docs nightly too ([#1176](https://github.com/rapidsai/dask-cuda/pull/1176)) [@AyodeAwe](https://github.com/AyodeAwe)
- Fix GHAs Workflows ([#1172](https://github.com/rapidsai/dask-cuda/pull/1172)) [@ajschmidt8](https://github.com/ajschmidt8)
- Remove `matrix_filter` from workflows ([#1168](https://github.com/rapidsai/dask-cuda/pull/1168)) [@charlesbluca](https://github.com/charlesbluca)
- Revert to branch-23.06 for shared-action-workflows ([#1167](https://github.com/rapidsai/dask-cuda/pull/1167)) [@shwina](https://github.com/shwina)
- Update minimum Python version to Python 3.9 ([#1164](https://github.com/rapidsai/dask-cuda/pull/1164)) [@shwina](https://github.com/shwina)
- Remove usage of rapids-get-rapids-version-from-git ([#1163](https://github.com/rapidsai/dask-cuda/pull/1163)) [@jjacobelli](https://github.com/jjacobelli)
- Use ARC V2 self-hosted runners for GPU jobs ([#1159](https://github.com/rapidsai/dask-cuda/pull/1159)) [@jjacobelli](https://github.com/jjacobelli)

# dask-cuda 23.04.00 (6 Apr 2023)

## 🚨 Breaking Changes
4 changes: 2 additions & 2 deletions ci/build_docs.sh
@@ -18,7 +18,7 @@ rapids-print-env
rapids-logger "Downloading artifacts from previous jobs"

PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
VERSION_NUMBER=$(rapids-get-rapids-version-from-git)
VERSION_NUMBER="23.06"

rapids-mamba-retry install \
--channel "${PYTHON_CHANNEL}" \
@@ -31,7 +31,7 @@ sphinx-build -b dirhtml ./source _html
sphinx-build -b text ./source _text
popd

if [[ "${RAPIDS_BUILD_TYPE}" == "branch" ]]; then
if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
rapids-logger "Upload Docs to S3"
aws s3 sync --no-progress --delete docs/_html "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/html"
aws s3 sync --no-progress --delete docs/_text "s3://rapidsai-docs/dask-cuda/${VERSION_NUMBER}/txt"
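The condition change above widens the docs upload from branch builds only to every non-pull-request build, so nightly runs upload too (matching the "run docs nightly too" entry in the changelog). A minimal sketch of the new gate — `should_upload_docs` is an illustrative helper, not a function in `ci/build_docs.sh`:

```shell
#!/usr/bin/env bash
# Sketch of the widened upload gate; the function name is illustrative.
should_upload_docs() {
  # Upload for any build type except pull-request (branch, nightly, ...).
  [[ "$1" != "pull-request" ]]
}

for build_type in branch nightly pull-request; do
  if should_upload_docs "$build_type"; then
    echo "$build_type: upload"
  else
    echo "$build_type: skip"
  fi
done
# → branch: upload / nightly: upload / pull-request: skip
```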
2 changes: 1 addition & 1 deletion ci/build_python_pypi.sh
@@ -8,7 +8,7 @@ python -m pip install build --user
export GIT_DESCRIBE_TAG=$(git describe --abbrev=0 --tags)
export GIT_DESCRIBE_NUMBER=$(git rev-list ${GIT_DESCRIBE_TAG}..HEAD --count)

# Compute/export VERSION_SUFFIX
# Compute/export RAPIDS_DATE_STRING
source rapids-env-update

python -m build \
2 changes: 2 additions & 0 deletions ci/release/update-version.sh
@@ -37,6 +37,8 @@ sed_runner "s/dask-cudf=.*/dask-cudf=${NEXT_SHORT_TAG}/g" dependencies.yaml
sed_runner "s/cucim=.*/cucim=${NEXT_SHORT_TAG}/g" dependencies.yaml
sed_runner "s/ucx-py=.*/ucx-py=${NEXT_UCXPY_VERSION}/g" dependencies.yaml

# CI files
for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
done
sed_runner "s/VERSION_NUMBER=\".*/VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
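The loop added above retargets every `shared-action-workflows` reference in the workflow files during a version bump. A quick illustration of what the sed expression does to one such line (the input line and tag value here are examples, not taken from the repository state):

```shell
NEXT_SHORT_TAG="23.06"
# On lines mentioning shared-action-workflows, rewrite everything from the
# first '@' to the end of the line with the new branch tag.
echo 'uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04' |
  sed "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g"
# → uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
```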
3 changes: 2 additions & 1 deletion ci/test_python.sh
@@ -41,10 +41,11 @@ set +e
rapids-logger "pytest dask-cuda"
pushd dask_cuda
DASK_CUDA_TEST_SINGLE_GPU=1 \
DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT=20 \
UCXPY_IFNAME=eth0 \
UCX_WARN_UNUSED_ENV_VARS=n \
UCX_MEMTYPE_CACHE=n \
timeout 30m pytest \
timeout 40m pytest \
-vv \
--capture=no \
--cache-clear \
4 changes: 4 additions & 0 deletions dask_cuda/cli.py
@@ -499,3 +499,7 @@ def config(
else:
client = Client(scheduler, security=security)
print_cluster_config(client)


if __name__ == "__main__":
worker()
29 changes: 0 additions & 29 deletions dask_cuda/device_host_file.py
@@ -2,7 +2,6 @@
import itertools
import logging
import os
import sys
import time

import numpy
@@ -240,34 +239,6 @@ def __init__(
# Dict of objects that will not be spilled by DeviceHostFile.
self.others = {}

if sys.version_info < (3, 9):

def __new__(
cls,
# So named such that dask will pass in the worker's local
# directory when constructing this through the "data" callback.
worker_local_directory,
*,
device_memory_limit=None,
memory_limit=None,
log_spilling=False,
):
"""
This is here to support Python 3.8. Right now (to support
3.8), ZictBase inherits from typing.MutableMapping through
which inspect.signature determines that the signature of
__init__ is just (*args, **kwargs). We need to advertise the
correct signature so that distributed will correctly figure
out that it needs to pass the worker's local directory. In
Python 3.9 and later, typing.MutableMapping is just an alias
for collections.abc.MutableMapping and we don't need to do
anything.
With this pass-through definition of __new__, the
signature of the constructor is correctly determined.
"""
return super().__new__(cls)

def __setitem__(self, key, value):
if key in self.device_buffer:
# Make sure we register the removal of an existing key
13 changes: 12 additions & 1 deletion dask_cuda/tests/test_device_host_file.py
@@ -2,8 +2,10 @@

import numpy as np
import pytest
from packaging import version

import dask.array
import distributed
from distributed.protocol import (
deserialize,
deserialize_bytes,
@@ -51,7 +53,16 @@ def test_device_host_file_short(
random.shuffle(full)

for k, v in full:
dhf[k] = v
try:
dhf[k] = v
except TypeError as e:
# TODO: Remove when pinning to distributed>=2023.5.1 .
# See https://github.com/rapidsai/dask-cuda/issues/1174 and
# https://github.com/dask/distributed/pull/7836 .
if version.parse(distributed.__version__) <= version.parse("2023.5.0"):
dhf[k] = v
else:
raise e

random.shuffle(full)

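The fallback above retries the assignment only on affected `distributed` releases. The version gate can be sketched without the `packaging` dependency — `parse_version` and `should_fallback` below are illustrative stand-ins, not helpers from the test suite:

```python
def parse_version(v: str) -> tuple:
    """Parse a plain release string like '2023.5.0' into a comparable tuple."""
    return tuple(int(part) for part in v.split("."))

def should_fallback(distributed_version: str) -> bool:
    # Retry dhf[k] = v only on distributed <= 2023.5.0, where the
    # TypeError is a known issue; on newer releases, re-raise.
    return parse_version(distributed_version) <= parse_version("2023.5.0")

print(should_fallback("2023.5.0"))  # → True
print(should_fallback("2023.5.1"))  # → False
```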
4 changes: 2 additions & 2 deletions dask_cuda/tests/test_spill.py
@@ -2,7 +2,6 @@
from time import sleep

import pytest
from zict.file import _safe_key as safe_key

import dask
from dask import array as da
@@ -31,7 +30,8 @@ def device_host_file_size_matches(
# `dhf.disk` is only available when Worker's `memory_limit != 0`
if dhf.disk is not None:
file_path = [
os.path.join(dhf.disk.directory, safe_key(k)) for k in dhf.disk.keys()
os.path.join(dhf.disk.directory, fname)
for fname in dhf.disk.filenames.values()
]
file_size = [os.path.getsize(f) for f in file_path]
byte_sum += sum(file_size)
6 changes: 5 additions & 1 deletion dask_cuda/utils.py
@@ -446,7 +446,9 @@ def wait_workers(
client: distributed.Client
Instance of client, used to query for number of workers connected.
min_timeout: float
Minimum number of seconds to wait before timeout.
Minimum number of seconds to wait before timeout. This value may be
overridden by setting the `DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT`
environment variable to a positive integer.
seconds_per_gpu: float
Seconds to wait for each GPU on the system. For example, if its
value is 2 and there is a total of 8 GPUs (workers) being started,
@@ -463,6 +465,8 @@
-------
True if all workers were started, False if a timeout occurs.
"""
min_timeout_env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None)
min_timeout = min_timeout if min_timeout_env is None else int(min_timeout_env)
n_gpus = n_gpus or get_n_gpus()
timeout = max(min_timeout, seconds_per_gpu * n_gpus)

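The two lines added to `wait_workers` let CI raise the floor on the worker-startup timeout through the environment. A self-contained sketch of the override logic — the function wrapper is illustrative, but the variable name matches the diff:

```python
import os

def resolve_min_timeout(min_timeout: float) -> float:
    # The environment variable wins over the argument when set, mirroring
    # the override added to wait_workers above.
    env = os.environ.get("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None)
    return min_timeout if env is None else int(env)

os.environ.pop("DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT", None)
print(resolve_min_timeout(10))  # → 10 (no override set)

os.environ["DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT"] = "20"
print(resolve_min_timeout(10))  # → 20 (environment takes precedence)
```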
18 changes: 7 additions & 11 deletions dependencies.yaml
@@ -76,10 +76,6 @@ dependencies:
specific:
- output_types: conda
matrices:
- matrix:
py: "3.8"
packages:
- python=3.8
- matrix:
py: "3.9"
packages:
@@ -90,32 +86,32 @@
- python=3.10
- matrix:
packages:
- python>=3.8,<3.11
- python>=3.9,<3.11
run_python:
common:
- output_types: [conda, requirements]
packages:
- dask==2023.3.2
- distributed==2023.3.2.1
- numba>=0.54
- numba>=0.57
- numpy>=1.21
- pandas>=1.3,<1.6.0dev0
- pynvml>=11.0.0,<11.5
- zict>=0.1.3
- zict>=2.0.0
- output_types: [conda]
packages:
- dask-core==2023.3.2
test_python:
common:
- output_types: [conda]
packages:
- cucim=23.04
- cudf=23.04
- dask-cudf=23.04
- cucim=23.06
- cudf=23.06
- dask-cudf=23.06
- pytest
- pytest-cov
- ucx-proc=*=gpu
- ucx-py=0.31
- ucx-py=0.32
specific:
- output_types: conda
matrices:
4 changes: 1 addition & 3 deletions docs/source/examples/best-practices.rst
@@ -9,9 +9,7 @@ When choosing between two multi-GPU setups, it is best to pick the one where mos
`DGX <https://www.nvidia.com/en-us/data-center/dgx-systems/>`_, a cloud instance with `multi-gpu options <https://rapids.ai/cloud>`_ , a high-density GPU HPC instance, etc. This is done for two reasons:

- Moving data between GPUs is costly and performance decreases when computation stops due to communication overheads, Host-to-Device/Device-to-Host transfers, etc
- Multi-GPU instances often come with accelerated networking like `NVLink <https://www.nvidia.com/en-us/data-center/nvlink/>`_. These accelerated
networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force and Host-to-Device/Device-to-Host transfers. See
`Accelerated Networking`_ for more discussion
- Multi-GPU instances often come with accelerated networking like `NVLink <https://www.nvidia.com/en-us/data-center/nvlink/>`_. These accelerated networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force any Host-to-Device/Device-to-Host transfers. See `Accelerated Networking`_ for more discussion.

.. code-block:: python
6 changes: 3 additions & 3 deletions docs/source/examples/ucx.rst
@@ -69,7 +69,7 @@ To start a Dask scheduler using UCX with automatic configuration and one GB of R
.. note::
The ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface.

We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here <https://ucx-py.readthedocs.io/en/latest/configuration.html#ucx-memtype-reg-whole-alloc-types>`_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand.
We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here <https://ucx-py.readthedocs.io/en/latest/configuration.html#ucx-memtype-reg-whole-alloc-types>`__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand.

Workers
^^^^^^^
@@ -86,7 +86,7 @@ To start workers with automatic UCX configuration and an RMM pool of 14GB per GP
.. note::
Analogous to the scheduler setup, the ``interface="ib0"`` is intentionally specified above to ensure RDMACM is used in systems that support InfiniBand. On systems that don't support InfiniBand or where RDMACM isn't required, the ``interface`` argument may be omitted or specified to listen on a different interface.

We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here <https://ucx-py.readthedocs.io/en/latest/configuration.html#ucx-memtype-reg-whole-alloc-types>`_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand.
We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here <https://ucx-py.readthedocs.io/en/latest/configuration.html#ucx-memtype-reg-whole-alloc-types>`__. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand.

Client
^^^^^^
@@ -122,7 +122,7 @@ Alternatively, the ``with dask.config.set`` statement from the example above may
We specify ``UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES=cuda`` above for optimal performance with InfiniBand, see details `here <https://ucx-py.readthedocs.io/en/latest/configuration.html#ucx-memtype-reg-whole-alloc-types>`_. If not using InfiniBand, that option may be omitted. In UCX 1.12 and newer, that option is default and may be omitted as well even when using InfiniBand.

``dask cuda worker`` with Manual Configuration
------------------------------------------
----------------------------------------------

When using ``dask cuda worker`` with UCX communication and manual configuration, the scheduler, workers, and client must all be started manually, each using the same UCX configuration.
