Merge pull request #969 from rapidsai/branch-22.08
[RELEASE] dask-cuda v22.08
raydouglass authored Aug 17, 2022
2 parents 2992966 + 9860cad commit dab48ca
Showing 22 changed files with 1,221 additions and 869 deletions.
39 changes: 39 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,42 @@
# dask-cuda 22.08.00 (17 Aug 2022)

## 🚨 Breaking Changes

- Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-)

## 🐛 Bug Fixes

- Fix `distributed` error related to `loop_in_thread` ([#963](https://github.com/rapidsai/dask-cuda/pull/963)) [@galipremsagar](https://github.com/galipremsagar)
- Add `__rmatmul__` to `ProxyObject` ([#960](https://github.com/rapidsai/dask-cuda/pull/960)) [@jakirkham](https://github.com/jakirkham)
- Always use versioneer command classes in setup.py ([#948](https://github.com/rapidsai/dask-cuda/pull/948)) [@wence-](https://github.com/wence-)
- Do not dispatch removed `cudf.Frame._index` object ([#947](https://github.com/rapidsai/dask-cuda/pull/947)) [@pentschev](https://github.com/pentschev)
- Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-)
- LocalCUDACluster's memory limit: `None` means no limit ([#943](https://github.com/rapidsai/dask-cuda/pull/943)) [@madsbk](https://github.com/madsbk)
- ProxyManager: support `memory_limit=None` ([#941](https://github.com/rapidsai/dask-cuda/pull/941)) [@madsbk](https://github.com/madsbk)
- Remove deprecated `loop` kwarg to `Nanny` in `CUDAWorker` ([#934](https://github.com/rapidsai/dask-cuda/pull/934)) [@pentschev](https://github.com/pentschev)
- Import `cleanup` fixture in `test_dask_cuda_worker.py` ([#924](https://github.com/rapidsai/dask-cuda/pull/924)) [@pentschev](https://github.com/pentschev)

## 📖 Documentation

- Switch docs to use common `js` & `css` code ([#967](https://github.com/rapidsai/dask-cuda/pull/967)) [@galipremsagar](https://github.com/galipremsagar)
- Switch `language` from `None` to `"en"` in docs build ([#939](https://github.com/rapidsai/dask-cuda/pull/939)) [@galipremsagar](https://github.com/galipremsagar)

## 🚀 New Features

- Add communications bandwidth to benchmarks ([#938](https://github.com/rapidsai/dask-cuda/pull/938)) [@pentschev](https://github.com/pentschev)

## 🛠️ Improvements

- Pin `dask` & `distributed` for release ([#965](https://github.com/rapidsai/dask-cuda/pull/965)) [@galipremsagar](https://github.com/galipremsagar)
- Test memory_limit=None for CUDAWorker ([#946](https://github.com/rapidsai/dask-cuda/pull/946)) [@wence-](https://github.com/wence-)
- benchmarks: Record total number of workers in dataframe ([#945](https://github.com/rapidsai/dask-cuda/pull/945)) [@wence-](https://github.com/wence-)
- Benchmark refactoring: tidy data and multi-node capability via `--scheduler-file` ([#940](https://github.com/rapidsai/dask-cuda/pull/940)) [@wence-](https://github.com/wence-)
- Add util functions to simplify printing benchmarks results ([#937](https://github.com/rapidsai/dask-cuda/pull/937)) [@pentschev](https://github.com/pentschev)
- Add --multiprocessing-method option to benchmarks ([#933](https://github.com/rapidsai/dask-cuda/pull/933)) [@wence-](https://github.com/wence-)
- Remove click pinning ([#932](https://github.com/rapidsai/dask-cuda/pull/932)) [@charlesbluca](https://github.com/charlesbluca)
- Remove compiler variables ([#929](https://github.com/rapidsai/dask-cuda/pull/929)) [@ajschmidt8](https://github.com/ajschmidt8)
- Unpin `dask` & `distributed` for development ([#927](https://github.com/rapidsai/dask-cuda/pull/927)) [@galipremsagar](https://github.com/galipremsagar)

# dask-cuda 22.06.00 (7 Jun 2022)

## 🚨 Breaking Changes
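As context for the `memory_limit` fixes listed in the 22.08.00 bug fixes above (#941, #943): passing `memory_limit=None` to `LocalCUDACluster` is now treated as "no host memory limit". The snippet below is an illustrative sketch only, not part of this commit; it assumes a machine with at least one GPU and otherwise default `LocalCUDACluster` arguments.

```python
from distributed import Client

from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    # Per #941/#943, memory_limit=None is interpreted as "no host memory limit".
    with LocalCUDACluster(memory_limit=None) as cluster:
        with Client(cluster) as client:
            print(list(client.scheduler_info()["workers"]))
```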
4 changes: 2 additions & 2 deletions ci/gpu/build.sh
@@ -26,7 +26,7 @@ cd "$WORKSPACE"
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
export UCX_PATH=$CONDA_PREFIX
-export UCXPY_VERSION=0.26.*
+export UCXPY_VERSION=0.27.*
unset GIT_DESCRIBE_TAG

# Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x,
@@ -38,7 +38,7 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1
export INSTALL_DASK_MAIN=0

# Dask version to install when `INSTALL_DASK_MAIN=0`
export DASK_STABLE_VERSION="2022.05.2"
export DASK_STABLE_VERSION="2022.7.1"

################################################################################
# SETUP - Check environment
10 changes: 2 additions & 8 deletions conda/recipes/dask-cuda/meta.yaml
@@ -5,7 +5,6 @@
{% set data = load_setup_py_data() %}

{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
-{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %}
{% set py_version=environ.get('CONDA_PY', 36) %}
package:
  name: dask-cuda
@@ -15,13 +14,8 @@ source:
  git_url: ../../..

build:
-  number: {{ git_revision_count }}
-  string: py{{ py_version }}_{{ git_revision_count }}
-  script_env:
-    - VERSION_SUFFIX
-    - CC
-    - CXX
-    - CUDAHOSTCXX
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}

requirements:
  host:
192 changes: 192 additions & 0 deletions dask_cuda/benchmarks/common.py
@@ -0,0 +1,192 @@
from argparse import Namespace
from functools import partial
from typing import Any, Callable, List, Mapping, NamedTuple, Optional, Tuple
from warnings import filterwarnings

import numpy as np
import pandas as pd

import dask
from distributed import Client

from dask_cuda.benchmarks.utils import (
    address_to_index,
    aggregate_transfer_log_data,
    bandwidth_statistics,
    get_cluster_options,
    peer_to_peer_bandwidths,
    save_benchmark_data,
    setup_memory_pools,
    wait_for_cluster,
)
from dask_cuda.utils import all_to_all

__all__ = ("execute_benchmark", "Config")


class Config(NamedTuple):
    """Benchmark configuration"""

    args: Namespace
    """Parsed benchmark arguments"""
    bench_once: Callable[[Client, Namespace, Optional[str]], Any]
    """Callable to run a single benchmark iteration
    Parameters
    ----------
    client
        distributed Client object
    args
        Benchmark parsed arguments
    write_profile
        Should a profile be written?
    Returns
    -------
    Benchmark data to be interpreted by ``pretty_print_results`` and
    ``create_tidy_results``.
    """
    create_tidy_results: Callable[
        [Namespace, np.ndarray, List[Any]], Tuple[pd.DataFrame, np.ndarray]
    ]
    """Callable to create tidy results for saving to disk
    Parameters
    ----------
    args
        Benchmark parsed arguments
    p2p_bw
        Array of point-to-point bandwidths
    results: list
        List of results from running ``bench_once``
    Returns
    -------
    tuple
        two-tuple of a pandas dataframe and the point-to-point bandwidths
    """
    pretty_print_results: Callable[
        [Namespace, Mapping[str, int], np.ndarray, List[Any]], None
    ]
    """Callable to pretty-print results for human consumption
    Parameters
    ----------
    args
        Benchmark parsed arguments
    address_to_index
        Mapping from worker addresses to indices
    p2p_bw
        Array of point-to-point bandwidths
    results: list
        List of results from running ``bench_once``
    """


def run_benchmark(client: Client, args: Namespace, config: Config):
    """Run a benchmark a specified number of times
    If ``args.profile`` is set, the final run is profiled."""
    results = []
    for _ in range(max(1, args.runs) - 1):
        res = config.bench_once(client, args, write_profile=None)
        results.append(res)
    results.append(config.bench_once(client, args, write_profile=args.profile))
    return results


def gather_bench_results(client: Client, args: Namespace, config: Config):
    """Collect benchmark results from the workers"""
    address2index = address_to_index(client)
    if args.all_to_all:
        all_to_all(client)
    results = run_benchmark(client, args, config)
    # Collect aggregated peer-to-peer bandwidth
    message_data = client.run(
        partial(aggregate_transfer_log_data, bandwidth_statistics, args.ignore_size)
    )
    return address2index, results, message_data


def run(client: Client, args: Namespace, config: Config):
    """Run the full benchmark on the cluster
    Waits for the cluster, sets up memory pools, prints and saves results"""
    wait_for_cluster(client, shutdown_on_failure=True)
    setup_memory_pools(
        client,
        args.type == "gpu",
        args.rmm_pool_size,
        args.disable_rmm_pool,
        args.rmm_log_directory,
    )
    address_to_index, results, message_data = gather_bench_results(client, args, config)
    p2p_bw = peer_to_peer_bandwidths(message_data, address_to_index)
    config.pretty_print_results(args, address_to_index, p2p_bw, results)
    if args.output_basename:
        df, p2p_bw = config.create_tidy_results(args, p2p_bw, results)
        df["num_workers"] = len(address_to_index)
        save_benchmark_data(
            args.output_basename,
            address_to_index,
            df,
            p2p_bw,
        )


def run_client_from_existing_scheduler(args: Namespace, config: Config):
    """Set up a client by connecting to a scheduler
    Shuts down the cluster at the end of the benchmark conditional on
    ``args.shutdown_cluster``.
    """
    if args.scheduler_address is not None:
        kwargs = {"address": args.scheduler_address}
    elif args.scheduler_file is not None:
        kwargs = {"scheduler_file": args.scheduler_file}
    else:
        raise RuntimeError(
            "Need to specify either --scheduler-file " "or --scheduler-address"
        )
    with Client(**kwargs) as client:
        run(client, args, config)
        if args.shutdown_cluster:
            client.shutdown()


def run_create_client(args, config):
    """Create a client + cluster and run
    Shuts down the cluster at the end of the benchmark"""
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    filterwarnings("ignore", message=".*NVLink.*rmm_pool_size.*", category=UserWarning)

    with Cluster(*cluster_args, **cluster_kwargs) as cluster:
        # Use the scheduler address with an SSHCluster rather than the cluster
        # object, otherwise we can't shut it down.
        with Client(scheduler_addr if args.multi_node else cluster) as client:
            run(client, args, config)
            # An SSHCluster will not automatically shut down, we have to
            # ensure it does.
            if args.multi_node:
                client.shutdown()


def execute_benchmark(config: Config):
    """Run complete benchmark given a configuration"""
    args = config.args
    if args.multiprocessing_method == "forkserver":
        import multiprocessing.forkserver as f

        f.ensure_running()
    with dask.config.set(
        {"distributed.worker.multiprocessing-method": args.multiprocessing_method}
    ):
        if args.scheduler_file is not None or args.scheduler_address is not None:
            run_client_from_existing_scheduler(args, config)
        else:
            run_create_client(args, config)
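
For orientation, the sketch below shows how a benchmark script might plug into the `Config`/`execute_benchmark` API added in this file. It is not part of this commit: the trivial `bench_once`, `pretty_print_results`, and `create_tidy_results` functions are placeholders, and `parse_benchmark_args` is assumed to be the shared argument-parsing helper in `dask_cuda.benchmarks.utils` (not shown in this diff).

```python
from argparse import Namespace
from time import monotonic
from typing import Optional

import numpy as np
import pandas as pd

from distributed import Client, wait

from dask_cuda.benchmarks.common import Config, execute_benchmark
from dask_cuda.benchmarks.utils import parse_benchmark_args  # assumed helper


def bench_once(client: Client, args: Namespace, write_profile: Optional[str] = None):
    # Placeholder workload: time a trivial task round-trip through the cluster.
    # A real benchmark would build a meaningful task graph and honour
    # write_profile (e.g. via distributed.performance_report).
    start = monotonic()
    wait(client.submit(sum, range(100_000)))
    return monotonic() - start


def pretty_print_results(args, address_to_index, p2p_bw, results):
    # `results` is the list of values returned by bench_once across runs.
    print(f"runs: {len(results)}, median wall clock: {np.median(results):.4f} s")


def create_tidy_results(args, p2p_bw, results):
    # One row per run, plus the point-to-point bandwidth array, for saving.
    return pd.DataFrame({"wallclock": results}), p2p_bw


if __name__ == "__main__":
    args = parse_benchmark_args(description="Toy dask-cuda benchmark")
    execute_benchmark(
        Config(
            args=args,
            bench_once=bench_once,
            create_tidy_results=create_tidy_results,
            pretty_print_results=pretty_print_results,
        )
    )
```

Because `bench_once` does no cluster setup of its own, the same script can run against a fresh `LocalCUDACluster` or, via `--scheduler-file`/`--scheduler-address`, against an existing multi-node cluster (see `run_client_from_existing_scheduler` above).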