diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b0cb14f..df09243b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ -# dask-cuda 22.08.00 (17 Aug 2022) +# dask-cuda 22.10.00 (12 Oct 2022) + +## 🐛 Bug Fixes + +- Revert "Update rearrange_by_column patch for explicit comms" ([#1001](https://github.com/rapidsai/dask-cuda/pull/1001)) [@rjzamora](https://github.com/rjzamora) +- Address CI failures caused by upstream distributed and cupy changes ([#993](https://github.com/rapidsai/dask-cuda/pull/993)) [@rjzamora](https://github.com/rjzamora) +- DeviceSerialized.__reduce_ex__: convert frame to numpy arrays ([#977](https://github.com/rapidsai/dask-cuda/pull/977)) [@madsbk](https://github.com/madsbk) + +## 📖 Documentation + +- Remove line-break that's breaking link ([#982](https://github.com/rapidsai/dask-cuda/pull/982)) [@ntabris](https://github.com/ntabris) +- Dask-cuda best practices ([#976](https://github.com/rapidsai/dask-cuda/pull/976)) [@quasiben](https://github.com/quasiben) +## 🚀 New Features + +- Add Groupby benchmark ([#979](https://github.com/rapidsai/dask-cuda/pull/979)) [@rjzamora](https://github.com/rjzamora) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1003](https://github.com/rapidsai/dask-cuda/pull/1003)) [@galipremsagar](https://github.com/galipremsagar) +- Update rearrange_by_column patch for explicit comms ([#992](https://github.com/rapidsai/dask-cuda/pull/992)) [@rjzamora](https://github.com/rjzamora) +- benchmarks: Add option to suppress output of point to point data ([#985](https://github.com/rapidsai/dask-cuda/pull/985)) [@wence-](https://github.com/wence-) +- Unpin `dask` and `distributed` for development ([#971](https://github.com/rapidsai/dask-cuda/pull/971)) [@galipremsagar](https://github.com/galipremsagar) + +# dask-cuda 22.08.00 (17 Aug 2022) ## 🚨 Breaking Changes - Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index d2450cfe..e468b1cb 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -19,6 +19,10 @@ export CUDA_REL=${CUDA_VERSION%.*} export GPUCI_CONDA_RETRY_MAX=1 export GPUCI_CONDA_RETRY_SLEEP=30 +# Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, +# `dask/label/dev` channel is removed. +export INSTALL_DASK_MAIN=0 + # Switch to project root; also root of repo checkout cd "$WORKSPACE" @@ -43,9 +47,13 @@ gpuci_logger "Activate conda env" . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -# Remove rapidsai-nightly channel if we are building main branch +# Remove `rapidsai-nightly` & `dask/label/dev` channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then conda config --system --remove channels rapidsai-nightly + conda config --system --remove channels dask/label/dev +elif [[ "${INSTALL_DASK_MAIN}" == 0 ]]; then +# Remove `dask/label/dev` channel if INSTALL_DASK_MAIN=0 + conda config --system --remove channels dask/label/dev fi gpuci_logger "Check compiler versions" @@ -61,8 +69,8 @@ conda list --show-channel-urls # FIX Added to deal with Anancoda SSL verification issues during conda builds conda config --set ssl_verify False -pip install git+https://github.com/dask/dask.git@main -pip install git+https://github.com/dask/distributed.git@main +pip install git+https://github.com/dask/dask.git@2022.9.2 +pip install git+https://github.com/dask/distributed.git@2022.9.2 ################################################################################ # BUILD - Package builds diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index fb050a6d..6db58ec0 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` export UCX_PATH=$CONDA_PREFIX -export UCXPY_VERSION=0.27.* +export UCXPY_VERSION=0.28.* unset GIT_DESCRIBE_TAG # Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x, @@ -38,7 +38,7 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.7.1" +export DASK_STABLE_VERSION="2022.9.2" ################################################################################ # SETUP - Check environment @@ -77,6 +77,7 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then else gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall + conda config --system --remove channels dask/label/dev fi diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py new file mode 100644 index 00000000..379ff930 --- /dev/null +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -0,0 +1,273 @@ +import contextlib +from collections import ChainMap +from time import perf_counter as clock + +import pandas as pd + +import dask +import dask.dataframe as dd +from dask.distributed import performance_report, wait +from dask.utils import format_bytes, parse_bytes + +from dask_cuda.benchmarks.common import Config, execute_benchmark +from dask_cuda.benchmarks.utils import ( + parse_benchmark_args, + print_key_value, + print_separator, + print_throughput_bandwidth, +) + + +def apply_groupby( + df, + sort=False, + split_out=1, + split_every=8, + shuffle=None, +): + # Handle special "explicit-comms" case + config = {} + if shuffle == "explicit-comms": + shuffle = "tasks" + config = {"explicit-comms": True} + + with dask.config.set(config): + agg = df.groupby("key", sort=sort).agg( + {"int64": ["max", "count"], "float64": "mean"}, + split_out=split_out, + split_every=split_every, + shuffle=shuffle, + ) + + wait(agg.persist()) + return agg + + 
+def generate_chunk(chunk_info, unique_size=1, gpu=True): + # Setting a seed that triggers max amount of comm in the two-GPU case. + if gpu: + import cupy as xp + + import cudf as xdf + else: + import numpy as xp + import pandas as xdf + + i_chunk, local_size = chunk_info + xp.random.seed(i_chunk * 1_000) + return xdf.DataFrame( + { + "key": xp.random.randint(0, unique_size, size=local_size, dtype="int64"), + "int64": xp.random.permutation(xp.arange(local_size, dtype="int64")), + "float64": xp.random.permutation(xp.arange(local_size, dtype="float64")), + } + ) + + +def get_random_ddf(args): + + total_size = args.chunk_size * args.in_parts + chunk_kwargs = { + "unique_size": max(int(args.unique_ratio * total_size), 1), + "gpu": True if args.type == "gpu" else False, + } + + return dd.from_map( + generate_chunk, + [(i, args.chunk_size) for i in range(args.in_parts)], + meta=generate_chunk((0, 1), **chunk_kwargs), + enforce_metadata=False, + **chunk_kwargs, + ) + + +def bench_once(client, args, write_profile=None): + + # Generate random Dask dataframe + df = get_random_ddf(args) + + data_processed = len(df) * sum([t.itemsize for t in df.dtypes]) + shuffle = { + "True": "tasks", + "False": False, + }.get(args.shuffle, args.shuffle) + + if write_profile is None: + ctx = contextlib.nullcontext() + else: + ctx = performance_report(filename=args.profile) + + with ctx: + t1 = clock() + agg = apply_groupby( + df, + sort=args.sort, + split_out=args.split_out, + split_every=args.split_every, + shuffle=shuffle, + ) + t2 = clock() + + output_size = agg.memory_usage(index=True, deep=True).compute().sum() + return (data_processed, output_size, t2 - t1) + + +def pretty_print_results(args, address_to_index, p2p_bw, results): + if args.markdown: + print("```") + print("Groupby benchmark") + print_separator(separator="-") + print_key_value(key="Use shuffle", value=f"{args.shuffle}") + print_key_value(key="Output partitions", value=f"{args.split_out}") + print_key_value(key="Input partitions", value=f"{args.in_parts}") + print_key_value(key="Sort Groups", value=f"{args.sort}") + print_key_value(key="Rows-per-chunk", value=f"{args.chunk_size}") + print_key_value(key="Unique-group ratio", value=f"{args.unique_ratio}") + print_key_value(key="Protocol", value=f"{args.protocol}") + print_key_value(key="Device(s)", value=f"{args.devs}") + print_key_value(key="Tree-reduction width", value=f"{args.split_every}") + if args.device_memory_limit: + print_key_value( + key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" + ) + print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") + if args.protocol == "ucx": + print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") + print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") + print_key_value(key="NVLink", value=f"{args.enable_nvlink}") + print_key_value(key="Worker thread(s)", value=f"{args.threads_per_worker}") + print_key_value(key="Data processed", value=f"{format_bytes(results[0][0])}") + print_key_value(key="Output size", value=f"{format_bytes(results[0][1])}") + if args.markdown: + print("\n```") + data_processed, output_size, durations = zip(*results) + print_throughput_bandwidth( + args, durations, data_processed, p2p_bw, address_to_index + ) + + +def create_tidy_results(args, p2p_bw, results): + configuration = { + "dataframe_type": "cudf" if args.type == "gpu" else "pandas", + "shuffle": args.shuffle, + "sort": args.sort, + "split_out": args.split_out, + "split_every": args.split_every, + "in_parts": 
args.in_parts, + "rows_per_chunk": args.chunk_size, + "unique_ratio": args.unique_ratio, + "protocol": args.protocol, + "devs": args.devs, + "device_memory_limit": args.device_memory_limit, + "rmm_pool": not args.disable_rmm_pool, + "tcp": args.enable_tcp_over_ucx, + "ib": args.enable_infiniband, + "nvlink": args.enable_nvlink, + } + timing_data = pd.DataFrame( + [ + pd.Series( + data=ChainMap( + configuration, + { + "wallclock": duration, + "data_processed": data_processed, + "output_size": output_size, + }, + ) + ) + for data_processed, output_size, duration in results + ] + ) + return timing_data, p2p_bw + + +def parse_args(): + special_args = [ + { + "name": "--in-parts", + "default": 100, + "metavar": "n", + "type": int, + "help": "Number of input partitions (default '100')", + }, + { + "name": [ + "-c", + "--chunk-size", + ], + "default": 1_000_000, + "metavar": "n", + "type": int, + "help": "Chunk size (default 1_000_000)", + }, + { + "name": "--unique-ratio", + "default": 0.01, + "type": float, + "help": "Fraction of rows that are unique groups", + }, + { + "name": "--sort", + "default": False, + "action": "store_true", + "help": "Whether to sort the output group order.", + }, + { + "name": "--split_out", + "default": 1, + "type": int, + "help": "How many partitions to return.", + }, + { + "name": "--split_every", + "default": 8, + "type": int, + "help": "Tree-reduction width.", + }, + { + "name": "--shuffle", + "choices": ["False", "True", "tasks", "explicit-comms"], + "default": "False", + "type": str, + "help": "Whether to use shuffle-based groupby.", + }, + { + "name": [ + "-t", + "--type", + ], + "choices": ["cpu", "gpu"], + "default": "gpu", + "type": str, + "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", + }, + { + "name": "--ignore-size", + "default": "1 MiB", + "metavar": "nbytes", + "type": parse_bytes, + "help": "Ignore messages smaller than this (default '1 MB')", + }, + { + "name": "--runs", + "default": 3, + "type": int, + "help": "Number of runs", + }, + ] + + return parse_benchmark_args( + description="Distributed groupby (dask/cudf) benchmark", args_list=special_args + ) + + +if __name__ == "__main__": + execute_benchmark( + Config( + args=parse_args(), + bench_once=bench_once, + create_tidy_results=create_tidy_results, + pretty_print_results=pretty_print_results, + ) + ) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 8b110e8c..34454980 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -185,6 +185,11 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "Note: --devs is currently ignored in multi-node mode and for each host " "one worker per GPU will be launched.", ) + parser.add_argument( + "--no-show-p2p-bandwidth", + action="store_true", + help="Do not produce detailed point to point bandwidth stats in output", + ) parser.add_argument( "--all-to-all", action="store_true", @@ -549,28 +554,29 @@ def print_throughput_bandwidth( key="Wall clock", value=f"{format_time(durations.mean())} +/- {format_time(durations.std()) }", ) - print_separator(separator="=") - if args.markdown: - print("
<details>\nWorker-Worker Transfer Rates\n\n```")
-
-    print_key_value(key="(w1,w2)", value="25% 50% 75% (total nbytes)")
-    print_separator(separator="-")
-    for (source, dest) in np.ndindex(p2p_bw.shape[:2]):
-        bw = BandwidthStats(*p2p_bw[source, dest, ...])
-        if bw.total_bytes > 0:
-            print_key_value(
-                key=f"({source},{dest})",
-                value=f"{format_bytes(bw.q25)}/s {format_bytes(bw.q50)}/s "
-                f"{format_bytes(bw.q75)}/s ({format_bytes(bw.total_bytes)})",
-            )
-    print_separator(separator="=")
-    print_key_value(key="Worker index", value="Worker address")
-    print_separator(separator="-")
-    for address, index in sorted(address_to_index.items(), key=itemgetter(1)):
-        print_key_value(key=index, value=address)
-    print_separator(separator="=")
-    if args.markdown:
-        print("```\n
\n") + if not args.no_show_p2p_bandwidth: + print_separator(separator="=") + if args.markdown: + print("
<details>\nWorker-Worker Transfer Rates\n\n```")
+
+        print_key_value(key="(w1,w2)", value="25% 50% 75% (total nbytes)")
+        print_separator(separator="-")
+        for (source, dest) in np.ndindex(p2p_bw.shape[:2]):
+            bw = BandwidthStats(*p2p_bw[source, dest, ...])
+            if bw.total_bytes > 0:
+                print_key_value(
+                    key=f"({source},{dest})",
+                    value=f"{format_bytes(bw.q25)}/s {format_bytes(bw.q50)}/s "
+                    f"{format_bytes(bw.q75)}/s ({format_bytes(bw.total_bytes)})",
+                )
+        print_separator(separator="=")
+        print_key_value(key="Worker index", value="Worker address")
+        print_separator(separator="-")
+        for address, index in sorted(address_to_index.items(), key=itemgetter(1)):
+            print_key_value(key=index, value=address)
+        print_separator(separator="=")
+        if args.markdown:
+            print("```\n
\n") if args.plot: plot_benchmark(throughputs, args.plot, historical=True) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index ae8e53de..e89ba64b 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -3,6 +3,7 @@ import os import time +import numpy from zict import Buffer, File, Func from zict.common import ZictBase @@ -115,7 +116,11 @@ def __sizeof__(self): def __reduce_ex__(self, protocol): header, frames = device_serialize(self) - frames = [f.obj for f in frames] + # Since pickle cannot handle memoryviews, we convert them + # to NumPy arrays (zero-copy). + frames = [ + (numpy.asarray(f) if isinstance(f, memoryview) else f) for f in frames + ] return device_deserialize, (header, frames) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 0c1f9a7f..2c21023c 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -7,6 +7,7 @@ import numpy as np import pandas import pytest +from packaging import version from pandas.testing import assert_frame_equal, assert_series_equal import dask @@ -649,6 +650,8 @@ def test_cupy_broadcast_to(): def test_cupy_matmul(): cupy = pytest.importorskip("cupy") + if version.parse(cupy.__version__) >= version.parse("11.0"): + pytest.xfail("See: https://github.com/rapidsai/dask-cuda/issues/995") a, b = cupy.arange(10), cupy.arange(10) c = a @ b assert c == proxy_object.asproxy(a) @ b @@ -658,6 +661,8 @@ def test_cupy_matmul(): def test_cupy_imatmul(): cupy = pytest.importorskip("cupy") + if version.parse(cupy.__version__) >= version.parse("11.0"): + pytest.xfail("See: https://github.com/rapidsai/dask-cuda/issues/995") a = cupy.arange(9).reshape(3, 3) c = a.copy() c @= a diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 73f211d1..f93b83ec 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,6 +1,5 @@ import os from time import sleep -from unittest.mock import patch import pytest from zict.file import _safe_key as safe_key @@ -207,57 +206,58 @@ async def test_cupy_cluster_device_spill(params): async def test_cudf_cluster_device_spill(params): cudf = pytest.importorskip("cudf") - # Disabling compression via environment variable seems to be the only way - # respected here. It is necessary to ensure spilled size matches the actual - # data size. 
- with patch.dict(os.environ, {"DASK_DISTRIBUTED__COMM__COMPRESSION": "False"}): - with dask.config.set({"distributed.worker.memory.terminate": False}): - async with LocalCUDACluster( - n_workers=1, - device_memory_limit=params["device_memory_limit"], - memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], - asynchronous=True, - ) as cluster: - async with Client(cluster, asynchronous=True) as client: - - # There's a known issue with datetime64: - # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 - # The same error above happens when spilling datetime64 to disk - cdf = ( - dask.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="400ms" - ) - .reset_index(drop=True) - .map_partitions(cudf.from_pandas) - ) + with dask.config.set( + { + "distributed.comm.compression": False, + "distributed.worker.memory.terminate": False, + } + ): + async with LocalCUDACluster( + n_workers=1, + device_memory_limit=params["device_memory_limit"], + memory_limit=params["memory_limit"], + memory_target_fraction=params["host_target"], + memory_spill_fraction=params["host_spill"], + memory_pause_fraction=params["host_pause"], + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: - sizes = await client.compute( - cdf.map_partitions(lambda df: df.memory_usage()) + # There's a known issue with datetime64: + # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 + # The same error above happens when spilling datetime64 to disk + cdf = ( + dask.datasets.timeseries( + dtypes={"x": int, "y": float}, freq="400ms" ) - sizes = sizes.to_arrow().to_pylist() - nbytes = sum(sizes) + .reset_index(drop=True) + .map_partitions(cudf.from_pandas) + ) - cdf2 = cdf.persist() - await wait(cdf2) + sizes = await client.compute( + cdf.map_partitions(lambda df: df.memory_usage()) + ) + sizes = sizes.to_arrow().to_pylist() + nbytes = sum(sizes) - del cdf + cdf2 = cdf.persist() + await wait(cdf2) - host_chunks = await client.run(lambda: len(get_worker().data.host)) - disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) - ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 + del cdf + + host_chunks = await client.run(lambda: len(get_worker().data.host)) + disk_chunks = await client.run( + lambda: len(get_worker().data.disk or list()) + ) + for hc, dc in zip(host_chunks.values(), disk_chunks.values()): + if params["spills_to_disk"]: + assert dc > 0 + else: + assert hc > 0 + assert dc == 0 - await client.run(worker_assert, nbytes, 32, 2048) + await client.run(worker_assert, nbytes, 32, 2048) - del cdf2 + del cdf2 - await client.run(delayed_worker_assert, 0, 0, 0) + await client.run(delayed_worker_assert, 0, 0, 0) diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst new file mode 100644 index 00000000..242e90ff --- /dev/null +++ b/docs/source/examples/best-practices.rst @@ -0,0 +1,117 @@ +Best Practices +============== + + +Multi-GPU Machines +~~~~~~~~~~~~~~~~~~ + +When choosing between two multi-GPU setups, it is best to pick the one where most GPUs are co-located with one-another. This could be a +`DGX `_, a cloud instance with `multi-gpu options `_ , a high-density GPU HPC instance, etc. 
This is done for two reasons:
+
+- Moving data between GPUs is costly, and performance decreases when computation stalls waiting on communication overheads, Host-to-Device/Device-to-Host transfers, etc.
+- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated
+networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force any Host-to-Device/Device-to-Host transfers. See
+`Accelerated Networking`_ for more discussion.
+
+.. code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(n_workers=2)  # will use GPUs 0,1
+    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="3,4")  # will use GPUs 3,4
+
+For more discussion on controlling the number of workers and using multiple GPUs, see :ref:`controlling-number-of-workers`.
+
+GPU Memory Management
+~~~~~~~~~~~~~~~~~~~~~
+
+When using Dask-CUDA, especially with RAPIDS, it's best to use an |rmm-pool|__ to pre-allocate memory on the GPU. Allocating memory, while fast, takes a small amount of time; however, one can easily make
+hundreds of thousands or even millions of allocations in trivial workflows, causing significant performance degradation. With an RMM pool, allocations are sub-allocated from a larger, pre-allocated pool, which greatly reduces the allocation time and thereby increases performance:
+
+
+ .. |rmm-pool| replace:: :abbr:`RMM (RAPIDS Memory Manager)` pool
+ __ https://docs.rapids.ai/api/rmm/stable/
+
+
+.. code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0,1",
+                               protocol="ucx",
+                               rmm_pool_size="30GB")
+
+
+We also recommend allocating most, though not all, of the GPU memory space. We do this because the `CUDA Context `_ takes a non-zero amount (typically 200-500 MB) of GPU RAM on the device.
+
+Additionally, when using `Accelerated Networking`_, we only need to register a single IPC handle for the whole pool (which is expensive, but only done once) since from the IPC point of view there's only a single allocation, as opposed to using RMM without a pool, where each new allocation must be registered with IPC.
+
+Accelerated Networking
+~~~~~~~~~~~~~~~~~~~~~~
+
+As discussed in `Multi-GPU Machines`_, accelerated networking has better bandwidth/throughput compared with traditional networking hardware and does
+not force any costly Host-to-Device/Device-to-Host transfers. Dask-CUDA can leverage accelerated networking hardware with `UCX-Py `_.
+
+As an example, let's compare a merge benchmark when using 2 GPUs connected with NVLink. First we'll run with standard TCP comms:
+
+::
+
+    python local_cudf_merge.py -d 0,1 -p tcp -c 50_000_000 --rmm-pool-size 30GB
+
+
+In the above, we used 2 GPUs (2 dask-cuda-workers), pre-allocated 30GB of GPU RAM (to make GPU memory allocations faster), and used TCP comms
+when Dask needed to move data back and forth between workers.
This setup results in an average wall clock time of ``19.72 s +/- 694.36 ms``::
+
+    ================================================================================
+    Wall clock                | Throughput
+    --------------------------------------------------------------------------------
+    20.09 s                   | 151.93 MiB/s
+    20.33 s                   | 150.10 MiB/s
+    18.75 s                   | 162.75 MiB/s
+    ================================================================================
+    Throughput                | 154.73 MiB/s +/- 3.14 MiB/s
+    Bandwidth                 | 139.22 MiB/s +/- 2.98 MiB/s
+    Wall clock                | 19.72 s +/- 694.36 ms
+    ================================================================================
+    (w1,w2)                   | 25% 50% 75% (total nbytes)
+    --------------------------------------------------------------------------------
+    (0,1)                     | 138.48 MiB/s 150.16 MiB/s 157.36 MiB/s (8.66 GiB)
+    (1,0)                     | 107.01 MiB/s 162.38 MiB/s 188.59 MiB/s (8.66 GiB)
+    ================================================================================
+    Worker index              | Worker address
+    --------------------------------------------------------------------------------
+    0                         | tcp://127.0.0.1:44055
+    1                         | tcp://127.0.0.1:41095
+    ================================================================================
+
+To compare, we'll now change the ``protocol`` from ``tcp`` to ``ucx``::
+
+    python local_cudf_merge.py -d 0,1 -p ucx -c 50_000_000 --rmm-pool-size 30GB
+
+
+With UCX and NVLink, we greatly reduced the wall clock time to ``347.43 ms +/- 5.41 ms``::
+
+    ================================================================================
+    Wall clock                | Throughput
+    --------------------------------------------------------------------------------
+    354.87 ms                 | 8.40 GiB/s
+    345.24 ms                 | 8.63 GiB/s
+    342.18 ms                 | 8.71 GiB/s
+    ================================================================================
+    Throughput                | 8.58 GiB/s +/- 78.96 MiB/s
+    Bandwidth                 | 6.98 GiB/s +/- 46.05 MiB/s
+    Wall clock                | 347.43 ms +/- 5.41 ms
+    ================================================================================
+    (w1,w2)                   | 25% 50% 75% (total nbytes)
+    --------------------------------------------------------------------------------
+    (0,1)                     | 17.38 GiB/s 17.94 GiB/s 18.88 GiB/s (8.66 GiB)
+    (1,0)                     | 16.55 GiB/s 17.80 GiB/s 18.87 GiB/s (8.66 GiB)
+    ================================================================================
+    Worker index              | Worker address
+    --------------------------------------------------------------------------------
+    0                         | ucx://127.0.0.1:35954
+    1                         | ucx://127.0.0.1:53584
+    ================================================================================
+
diff --git a/docs/source/examples/worker_count.rst b/docs/source/examples/worker_count.rst
index 29c6502c..62954ffb 100644
--- a/docs/source/examples/worker_count.rst
+++ b/docs/source/examples/worker_count.rst
@@ -1,3 +1,5 @@
+.. _controlling-number-of-workers:
+
 Controlling number of workers
 =============================
 
@@ -44,4 +46,4 @@ These UUIDs can then be passed to ``CUDA_VISIBLE_DEVICES`` in place of a GPU ind
 .. code-block:: bash
 
     $ CUDA_VISIBLE_DEVICES="GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31" \
-    > dask-cuda-worker 127.0.0.1:8786
+    > dask-cuda-worker 127.0.0.1:8786
diff --git a/docs/source/index.rst b/docs/source/index.rst
index efd7f62f..a43f2907 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -39,5 +39,6 @@ Contents
    :maxdepth: 1
    :caption: Examples
 
+   examples/best-practices
    examples/worker_count
    examples/ucx
diff --git a/requirements.txt b/requirements.txt
index a384bfc2..3d673a95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-dask==2022.7.1
-distributed==2022.7.1
+dask==2022.9.2
+distributed==2022.9.2
 pynvml>=11.0.0
 numpy>=1.16.0
 numba>=0.54
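A minimal illustrative sketch (not part of the changeset above) of the idea behind the `DeviceSerialized.__reduce_ex__` change in `dask_cuda/device_host_file.py`: pickle refuses raw `memoryview` frames, while `numpy.asarray` wraps the same buffer zero-copy, and the resulting array pickles and round-trips cleanly. The snippet is standalone and does not depend on dask-cuda itself.

```python
# Sketch of the memoryview -> NumPy conversion used in the patch above.
import pickle

import numpy

frame = memoryview(bytearray(b"spilled-bytes"))

try:
    pickle.dumps(frame)  # raises: pickle cannot handle memoryview objects
except TypeError:
    pass

wrapped = numpy.asarray(frame)  # zero-copy wrapper around the same buffer
restored = pickle.loads(pickle.dumps(wrapped))
assert bytes(restored) == b"spilled-bytes"
```

Relatedly, one possible (hypothetical) way to exercise the new groupby benchmark together with the `--no-show-p2p-bandwidth` option added above, assuming dask-cuda is installed, is ``python -m dask_cuda.benchmarks.local_cudf_groupby -d 0 --shuffle explicit-comms --no-show-p2p-bandwidth``.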