diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b0cb14f..df09243b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ -# dask-cuda 22.08.00 (17 Aug 2022) +# dask-cuda 22.10.00 (12 Oct 2022) + +## 🐛 Bug Fixes + +- Revert "Update rearrange_by_column patch for explicit comms" ([#1001](https://github.com/rapidsai/dask-cuda/pull/1001)) [@rjzamora](https://github.com/rjzamora) +- Address CI failures caused by upstream distributed and cupy changes ([#993](https://github.com/rapidsai/dask-cuda/pull/993)) [@rjzamora](https://github.com/rjzamora) +- DeviceSerialized.__reduce_ex__: convert frame to numpy arrays ([#977](https://github.com/rapidsai/dask-cuda/pull/977)) [@madsbk](https://github.com/madsbk) + +## 📖 Documentation + +- Remove line-break that's breaking link ([#982](https://github.com/rapidsai/dask-cuda/pull/982)) [@ntabris](https://github.com/ntabris) +- Dask-cuda best practices ([#976](https://github.com/rapidsai/dask-cuda/pull/976)) [@quasiben](https://github.com/quasiben) +## 🚀 New Features + +- Add Groupby benchmark ([#979](https://github.com/rapidsai/dask-cuda/pull/979)) [@rjzamora](https://github.com/rjzamora) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1003](https://github.com/rapidsai/dask-cuda/pull/1003)) [@galipremsagar](https://github.com/galipremsagar) +- Update rearrange_by_column patch for explicit comms ([#992](https://github.com/rapidsai/dask-cuda/pull/992)) [@rjzamora](https://github.com/rjzamora) +- benchmarks: Add option to suppress output of point to point data ([#985](https://github.com/rapidsai/dask-cuda/pull/985)) [@wence-](https://github.com/wence-) +- Unpin `dask` and `distributed` for development ([#971](https://github.com/rapidsai/dask-cuda/pull/971)) [@galipremsagar](https://github.com/galipremsagar) + +# dask-cuda 22.08.00 (17 Aug 2022) ## 🚨 Breaking Changes - Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index d2450cfe..e468b1cb 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -19,6 +19,10 @@ export CUDA_REL=${CUDA_VERSION%.*} export GPUCI_CONDA_RETRY_MAX=1 export GPUCI_CONDA_RETRY_SLEEP=30 +# Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, +# `dask/label/dev` channel is removed. +export INSTALL_DASK_MAIN=0 + # Switch to project root; also root of repo checkout cd "$WORKSPACE" @@ -43,9 +47,13 @@ gpuci_logger "Activate conda env" . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -# Remove rapidsai-nightly channel if we are building main branch +# Remove `rapidsai-nightly` & `dask/label/dev` channel if we are building main branch if [ "$SOURCE_BRANCH" = "main" ]; then conda config --system --remove channels rapidsai-nightly + conda config --system --remove channels dask/label/dev +elif [[ "${INSTALL_DASK_MAIN}" == 0 ]]; then +# Remove `dask/label/dev` channel if INSTALL_DASK_MAIN=0 + conda config --system --remove channels dask/label/dev fi gpuci_logger "Check compiler versions" @@ -61,8 +69,8 @@ conda list --show-channel-urls # FIX Added to deal with Anancoda SSL verification issues during conda builds conda config --set ssl_verify False -pip install git+https://github.com/dask/dask.git@main -pip install git+https://github.com/dask/distributed.git@main +pip install git+https://github.com/dask/dask.git@2022.9.2 +pip install git+https://github.com/dask/distributed.git@2022.9.2 ################################################################################ # BUILD - Package builds diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index fb050a6d..6db58ec0 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` export UCX_PATH=$CONDA_PREFIX -export UCXPY_VERSION=0.27.* +export UCXPY_VERSION=0.28.* unset GIT_DESCRIBE_TAG # Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x, @@ -38,7 +38,7 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.7.1" +export DASK_STABLE_VERSION="2022.9.2" ################################################################################ # SETUP - Check environment @@ -77,6 +77,7 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then else gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall + conda config --system --remove channels dask/label/dev fi diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py new file mode 100644 index 00000000..379ff930 --- /dev/null +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -0,0 +1,273 @@ +import contextlib +from collections import ChainMap +from time import perf_counter as clock + +import pandas as pd + +import dask +import dask.dataframe as dd +from dask.distributed import performance_report, wait +from dask.utils import format_bytes, parse_bytes + +from dask_cuda.benchmarks.common import Config, execute_benchmark +from dask_cuda.benchmarks.utils import ( + parse_benchmark_args, + print_key_value, + print_separator, + print_throughput_bandwidth, +) + + +def apply_groupby( + df, + sort=False, + split_out=1, + split_every=8, + shuffle=None, +): + # Handle special "explicit-comms" case + config = {} + if shuffle == "explicit-comms": + shuffle = "tasks" + config = {"explicit-comms": True} + + with dask.config.set(config): + agg = df.groupby("key", sort=sort).agg( + {"int64": ["max", "count"], "float64": "mean"}, + split_out=split_out, + split_every=split_every, + shuffle=shuffle, + ) + + wait(agg.persist()) + return agg + + 
+def generate_chunk(chunk_info, unique_size=1, gpu=True): + # Setting a seed that triggers max amount of comm in the two-GPU case. + if gpu: + import cupy as xp + + import cudf as xdf + else: + import numpy as xp + import pandas as xdf + + i_chunk, local_size = chunk_info + xp.random.seed(i_chunk * 1_000) + return xdf.DataFrame( + { + "key": xp.random.randint(0, unique_size, size=local_size, dtype="int64"), + "int64": xp.random.permutation(xp.arange(local_size, dtype="int64")), + "float64": xp.random.permutation(xp.arange(local_size, dtype="float64")), + } + ) + + +def get_random_ddf(args): + + total_size = args.chunk_size * args.in_parts + chunk_kwargs = { + "unique_size": max(int(args.unique_ratio * total_size), 1), + "gpu": True if args.type == "gpu" else False, + } + + return dd.from_map( + generate_chunk, + [(i, args.chunk_size) for i in range(args.in_parts)], + meta=generate_chunk((0, 1), **chunk_kwargs), + enforce_metadata=False, + **chunk_kwargs, + ) + + +def bench_once(client, args, write_profile=None): + + # Generate random Dask dataframe + df = get_random_ddf(args) + + data_processed = len(df) * sum([t.itemsize for t in df.dtypes]) + shuffle = { + "True": "tasks", + "False": False, + }.get(args.shuffle, args.shuffle) + + if write_profile is None: + ctx = contextlib.nullcontext() + else: + ctx = performance_report(filename=args.profile) + + with ctx: + t1 = clock() + agg = apply_groupby( + df, + sort=args.sort, + split_out=args.split_out, + split_every=args.split_every, + shuffle=shuffle, + ) + t2 = clock() + + output_size = agg.memory_usage(index=True, deep=True).compute().sum() + return (data_processed, output_size, t2 - t1) + + +def pretty_print_results(args, address_to_index, p2p_bw, results): + if args.markdown: + print("```") + print("Groupby benchmark") + print_separator(separator="-") + print_key_value(key="Use shuffle", value=f"{args.shuffle}") + print_key_value(key="Output partitions", value=f"{args.split_out}") + print_key_value(key="Input partitions", value=f"{args.in_parts}") + print_key_value(key="Sort Groups", value=f"{args.sort}") + print_key_value(key="Rows-per-chunk", value=f"{args.chunk_size}") + print_key_value(key="Unique-group ratio", value=f"{args.unique_ratio}") + print_key_value(key="Protocol", value=f"{args.protocol}") + print_key_value(key="Device(s)", value=f"{args.devs}") + print_key_value(key="Tree-reduction width", value=f"{args.split_every}") + if args.device_memory_limit: + print_key_value( + key="Device memory limit", value=f"{format_bytes(args.device_memory_limit)}" + ) + print_key_value(key="RMM Pool", value=f"{not args.disable_rmm_pool}") + if args.protocol == "ucx": + print_key_value(key="TCP", value=f"{args.enable_tcp_over_ucx}") + print_key_value(key="InfiniBand", value=f"{args.enable_infiniband}") + print_key_value(key="NVLink", value=f"{args.enable_nvlink}") + print_key_value(key="Worker thread(s)", value=f"{args.threads_per_worker}") + print_key_value(key="Data processed", value=f"{format_bytes(results[0][0])}") + print_key_value(key="Output size", value=f"{format_bytes(results[0][1])}") + if args.markdown: + print("\n```") + data_processed, output_size, durations = zip(*results) + print_throughput_bandwidth( + args, durations, data_processed, p2p_bw, address_to_index + ) + + +def create_tidy_results(args, p2p_bw, results): + configuration = { + "dataframe_type": "cudf" if args.type == "gpu" else "pandas", + "shuffle": args.shuffle, + "sort": args.sort, + "split_out": args.split_out, + "split_every": args.split_every, + "in_parts": 
args.in_parts, + "rows_per_chunk": args.chunk_size, + "unique_ratio": args.unique_ratio, + "protocol": args.protocol, + "devs": args.devs, + "device_memory_limit": args.device_memory_limit, + "rmm_pool": not args.disable_rmm_pool, + "tcp": args.enable_tcp_over_ucx, + "ib": args.enable_infiniband, + "nvlink": args.enable_nvlink, + } + timing_data = pd.DataFrame( + [ + pd.Series( + data=ChainMap( + configuration, + { + "wallclock": duration, + "data_processed": data_processed, + "output_size": output_size, + }, + ) + ) + for data_processed, output_size, duration in results + ] + ) + return timing_data, p2p_bw + + +def parse_args(): + special_args = [ + { + "name": "--in-parts", + "default": 100, + "metavar": "n", + "type": int, + "help": "Number of input partitions (default '100')", + }, + { + "name": [ + "-c", + "--chunk-size", + ], + "default": 1_000_000, + "metavar": "n", + "type": int, + "help": "Chunk size (default 1_000_000)", + }, + { + "name": "--unique-ratio", + "default": 0.01, + "type": float, + "help": "Fraction of rows that are unique groups", + }, + { + "name": "--sort", + "default": False, + "action": "store_true", + "help": "Whether to sort the output group order.", + }, + { + "name": "--split_out", + "default": 1, + "type": int, + "help": "How many partitions to return.", + }, + { + "name": "--split_every", + "default": 8, + "type": int, + "help": "Tree-reduction width.", + }, + { + "name": "--shuffle", + "choices": ["False", "True", "tasks", "explicit-comms"], + "default": "False", + "type": str, + "help": "Whether to use shuffle-based groupby.", + }, + { + "name": [ + "-t", + "--type", + ], + "choices": ["cpu", "gpu"], + "default": "gpu", + "type": str, + "help": "Do shuffle with GPU or CPU dataframes (default 'gpu')", + }, + { + "name": "--ignore-size", + "default": "1 MiB", + "metavar": "nbytes", + "type": parse_bytes, + "help": "Ignore messages smaller than this (default '1 MB')", + }, + { + "name": "--runs", + "default": 3, + "type": int, + "help": "Number of runs", + }, + ] + + return parse_benchmark_args( + description="Distributed groupby (dask/cudf) benchmark", args_list=special_args + ) + + +if __name__ == "__main__": + execute_benchmark( + Config( + args=parse_args(), + bench_once=bench_once, + create_tidy_results=create_tidy_results, + pretty_print_results=pretty_print_results, + ) + ) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 8b110e8c..34454980 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -185,6 +185,11 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] "Note: --devs is currently ignored in multi-node mode and for each host " "one worker per GPU will be launched.", ) + parser.add_argument( + "--no-show-p2p-bandwidth", + action="store_true", + help="Do not produce detailed point to point bandwidth stats in output", + ) parser.add_argument( "--all-to-all", action="store_true", @@ -549,28 +554,29 @@ def print_throughput_bandwidth( key="Wall clock", value=f"{format_time(durations.mean())} +/- {format_time(durations.std()) }", ) - print_separator(separator="=") - if args.markdown: - print("
<details>\nWorker-Worker Transfer Rates\n\n```")
-
-    print_key_value(key="(w1,w2)", value="25% 50% 75% (total nbytes)")
-    print_separator(separator="-")
-    for (source, dest) in np.ndindex(p2p_bw.shape[:2]):
-        bw = BandwidthStats(*p2p_bw[source, dest, ...])
-        if bw.total_bytes > 0:
-            print_key_value(
-                key=f"({source},{dest})",
-                value=f"{format_bytes(bw.q25)}/s {format_bytes(bw.q50)}/s "
-                f"{format_bytes(bw.q75)}/s ({format_bytes(bw.total_bytes)})",
-            )
-    print_separator(separator="=")
-    print_key_value(key="Worker index", value="Worker address")
-    print_separator(separator="-")
-    for address, index in sorted(address_to_index.items(), key=itemgetter(1)):
-        print_key_value(key=index, value=address)
-    print_separator(separator="=")
-    if args.markdown:
-        print("```\n
\n") + if not args.no_show_p2p_bandwidth: + print_separator(separator="=") + if args.markdown: + print("
<details>\nWorker-Worker Transfer Rates\n\n```")
+
+        print_key_value(key="(w1,w2)", value="25% 50% 75% (total nbytes)")
+        print_separator(separator="-")
+        for (source, dest) in np.ndindex(p2p_bw.shape[:2]):
+            bw = BandwidthStats(*p2p_bw[source, dest, ...])
+            if bw.total_bytes > 0:
+                print_key_value(
+                    key=f"({source},{dest})",
+                    value=f"{format_bytes(bw.q25)}/s {format_bytes(bw.q50)}/s "
+                    f"{format_bytes(bw.q75)}/s ({format_bytes(bw.total_bytes)})",
+                )
+        print_separator(separator="=")
+        print_key_value(key="Worker index", value="Worker address")
+        print_separator(separator="-")
+        for address, index in sorted(address_to_index.items(), key=itemgetter(1)):
+            print_key_value(key=index, value=address)
+        print_separator(separator="=")
+        if args.markdown:
+            print("```\n
\n") if args.plot: plot_benchmark(throughputs, args.plot, historical=True) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index ae8e53de..e89ba64b 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -3,6 +3,7 @@ import os import time +import numpy from zict import Buffer, File, Func from zict.common import ZictBase @@ -115,7 +116,11 @@ def __sizeof__(self): def __reduce_ex__(self, protocol): header, frames = device_serialize(self) - frames = [f.obj for f in frames] + # Since pickle cannot handle memoryviews, we convert them + # to NumPy arrays (zero-copy). + frames = [ + (numpy.asarray(f) if isinstance(f, memoryview) else f) for f in frames + ] return device_deserialize, (header, frames) diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 0c1f9a7f..2c21023c 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -7,6 +7,7 @@ import numpy as np import pandas import pytest +from packaging import version from pandas.testing import assert_frame_equal, assert_series_equal import dask @@ -649,6 +650,8 @@ def test_cupy_broadcast_to(): def test_cupy_matmul(): cupy = pytest.importorskip("cupy") + if version.parse(cupy.__version__) >= version.parse("11.0"): + pytest.xfail("See: https://github.com/rapidsai/dask-cuda/issues/995") a, b = cupy.arange(10), cupy.arange(10) c = a @ b assert c == proxy_object.asproxy(a) @ b @@ -658,6 +661,8 @@ def test_cupy_matmul(): def test_cupy_imatmul(): cupy = pytest.importorskip("cupy") + if version.parse(cupy.__version__) >= version.parse("11.0"): + pytest.xfail("See: https://github.com/rapidsai/dask-cuda/issues/995") a = cupy.arange(9).reshape(3, 3) c = a.copy() c @= a diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 73f211d1..f93b83ec 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,6 +1,5 @@ import os from time import sleep -from unittest.mock import patch import pytest from zict.file import _safe_key as safe_key @@ -207,57 +206,58 @@ async def test_cupy_cluster_device_spill(params): async def test_cudf_cluster_device_spill(params): cudf = pytest.importorskip("cudf") - # Disabling compression via environment variable seems to be the only way - # respected here. It is necessary to ensure spilled size matches the actual - # data size. 
- with patch.dict(os.environ, {"DASK_DISTRIBUTED__COMM__COMPRESSION": "False"}): - with dask.config.set({"distributed.worker.memory.terminate": False}): - async with LocalCUDACluster( - n_workers=1, - device_memory_limit=params["device_memory_limit"], - memory_limit=params["memory_limit"], - memory_target_fraction=params["host_target"], - memory_spill_fraction=params["host_spill"], - memory_pause_fraction=params["host_pause"], - asynchronous=True, - ) as cluster: - async with Client(cluster, asynchronous=True) as client: - - # There's a known issue with datetime64: - # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 - # The same error above happens when spilling datetime64 to disk - cdf = ( - dask.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="400ms" - ) - .reset_index(drop=True) - .map_partitions(cudf.from_pandas) - ) + with dask.config.set( + { + "distributed.comm.compression": False, + "distributed.worker.memory.terminate": False, + } + ): + async with LocalCUDACluster( + n_workers=1, + device_memory_limit=params["device_memory_limit"], + memory_limit=params["memory_limit"], + memory_target_fraction=params["host_target"], + memory_spill_fraction=params["host_spill"], + memory_pause_fraction=params["host_pause"], + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: - sizes = await client.compute( - cdf.map_partitions(lambda df: df.memory_usage()) + # There's a known issue with datetime64: + # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 + # The same error above happens when spilling datetime64 to disk + cdf = ( + dask.datasets.timeseries( + dtypes={"x": int, "y": float}, freq="400ms" ) - sizes = sizes.to_arrow().to_pylist() - nbytes = sum(sizes) + .reset_index(drop=True) + .map_partitions(cudf.from_pandas) + ) - cdf2 = cdf.persist() - await wait(cdf2) + sizes = await client.compute( + cdf.map_partitions(lambda df: df.memory_usage()) + ) + sizes = sizes.to_arrow().to_pylist() + nbytes = sum(sizes) - del cdf + cdf2 = cdf.persist() + await wait(cdf2) - host_chunks = await client.run(lambda: len(get_worker().data.host)) - disk_chunks = await client.run( - lambda: len(get_worker().data.disk or list()) - ) - for hc, dc in zip(host_chunks.values(), disk_chunks.values()): - if params["spills_to_disk"]: - assert dc > 0 - else: - assert hc > 0 - assert dc == 0 + del cdf + + host_chunks = await client.run(lambda: len(get_worker().data.host)) + disk_chunks = await client.run( + lambda: len(get_worker().data.disk or list()) + ) + for hc, dc in zip(host_chunks.values(), disk_chunks.values()): + if params["spills_to_disk"]: + assert dc > 0 + else: + assert hc > 0 + assert dc == 0 - await client.run(worker_assert, nbytes, 32, 2048) + await client.run(worker_assert, nbytes, 32, 2048) - del cdf2 + del cdf2 - await client.run(delayed_worker_assert, 0, 0, 0) + await client.run(delayed_worker_assert, 0, 0, 0) diff --git a/docs/source/examples/best-practices.rst b/docs/source/examples/best-practices.rst new file mode 100644 index 00000000..242e90ff --- /dev/null +++ b/docs/source/examples/best-practices.rst @@ -0,0 +1,117 @@ +Best Practices +============== + + +Multi-GPU Machines +~~~~~~~~~~~~~~~~~~ + +When choosing between two multi-GPU setups, it is best to pick the one where most GPUs are co-located with one-another. This could be a +`DGX `_, a cloud instance with `multi-gpu options `_ , a high-density GPU HPC instance, etc. 
This is done for two reasons:
+
+- Moving data between GPUs is costly, and performance decreases when computation stalls waiting on communication overheads, Host-to-Device/Device-to-Host transfers, etc.
+- Multi-GPU instances often come with accelerated networking like `NVLink `_. These accelerated
+networking paths usually have much higher throughput/bandwidth compared with traditional networking *and* don't force any Host-to-Device/Device-to-Host transfers. See
+`Accelerated Networking`_ for more discussion.
+
+.. code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(n_workers=2)  # will use GPUs 0,1
+    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="3,4")  # will use GPUs 3,4
+
+For more discussion on controlling the number of workers and using multiple GPUs, see :ref:`controlling-number-of-workers`.
+
+GPU Memory Management
+~~~~~~~~~~~~~~~~~~~~~
+
+When using Dask-CUDA, especially with RAPIDS, it's best to use an |rmm-pool|__ to pre-allocate memory on the GPU. Allocating memory, while fast, takes a small amount of time; however, one can easily make
+hundreds of thousands or even millions of allocations in trivial workflows, causing significant performance degradation. With an RMM pool, allocations are sub-allocated from a larger, pre-allocated pool, which greatly reduces the allocation time and thereby increases performance:
+
+
+ .. |rmm-pool| replace:: :abbr:`RMM (RAPIDS Memory Manager)` pool
+ __ https://docs.rapids.ai/api/rmm/stable/
+
+
+.. code-block:: python
+
+    from dask_cuda import LocalCUDACluster
+
+    cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES="0,1",
+                               protocol="ucx",
+                               rmm_pool_size="30GB")
+
+
+We also recommend allocating most, though not all, of the GPU memory space. We do this because the `CUDA Context `_ takes a non-zero amount (typically 200-500 MB) of GPU RAM on the device.
+
+Additionally, when using `Accelerated Networking`_, we only need to register a single IPC handle for the whole pool (which is expensive, but only done once) since from the IPC point of view there's only a single allocation, as opposed to using RMM without a pool, where each new allocation must be registered with IPC.
+
+Accelerated Networking
+~~~~~~~~~~~~~~~~~~~~~~
+
+As discussed in `Multi-GPU Machines`_, accelerated networking has better bandwidth/throughput compared with traditional networking hardware and does
+not force any costly Host-to-Device/Device-to-Host transfers. Dask-CUDA can leverage accelerated networking hardware with `UCX-Py `_.
+
+As an example, let's compare a merge benchmark when using 2 GPUs connected with NVLink. First we'll run with standard TCP comms:
+
+::
+
+    python local_cudf_merge.py -d 0,1 -p tcp -c 50_000_000 --rmm-pool-size 30GB
+
+
+In the above, we used 2 GPUs (2 dask-cuda-workers), pre-allocated 30GB of GPU RAM (to make GPU memory allocations faster), and used TCP comms
+when Dask needed to move data back and forth between workers.
This setup results in an average wall clock time of ``19.72 s +/- 694.36 ms``::
+
+    ================================================================================
+    Wall clock                | Throughput
+    --------------------------------------------------------------------------------
+    20.09 s                   | 151.93 MiB/s
+    20.33 s                   | 150.10 MiB/s
+    18.75 s                   | 162.75 MiB/s
+    ================================================================================
+    Throughput                | 154.73 MiB/s +/- 3.14 MiB/s
+    Bandwidth                 | 139.22 MiB/s +/- 2.98 MiB/s
+    Wall clock                | 19.72 s +/- 694.36 ms
+    ================================================================================
+    (w1,w2)                   | 25% 50% 75% (total nbytes)
+    --------------------------------------------------------------------------------
+    (0,1)                     | 138.48 MiB/s 150.16 MiB/s 157.36 MiB/s (8.66 GiB)
+    (1,0)                     | 107.01 MiB/s 162.38 MiB/s 188.59 MiB/s (8.66 GiB)
+    ================================================================================
+    Worker index              | Worker address
+    --------------------------------------------------------------------------------
+    0                         | tcp://127.0.0.1:44055
+    1                         | tcp://127.0.0.1:41095
+    ================================================================================
+
+To compare, we'll now change the ``protocol`` from ``tcp`` to ``ucx``::
+
+    python local_cudf_merge.py -d 0,1 -p ucx -c 50_000_000 --rmm-pool-size 30GB
+
+
+With UCX and NVLink, we greatly reduced the wall clock time to ``347.43 ms +/- 5.41 ms``::
+
+    ================================================================================
+    Wall clock                | Throughput
+    --------------------------------------------------------------------------------
+    354.87 ms                 | 8.40 GiB/s
+    345.24 ms                 | 8.63 GiB/s
+    342.18 ms                 | 8.71 GiB/s
+    ================================================================================
+    Throughput                | 8.58 GiB/s +/- 78.96 MiB/s
+    Bandwidth                 | 6.98 GiB/s +/- 46.05 MiB/s
+    Wall clock                | 347.43 ms +/- 5.41 ms
+    ================================================================================
+    (w1,w2)                   | 25% 50% 75% (total nbytes)
+    --------------------------------------------------------------------------------
+    (0,1)                     | 17.38 GiB/s 17.94 GiB/s 18.88 GiB/s (8.66 GiB)
+    (1,0)                     | 16.55 GiB/s 17.80 GiB/s 18.87 GiB/s (8.66 GiB)
+    ================================================================================
+    Worker index              | Worker address
+    --------------------------------------------------------------------------------
+    0                         | ucx://127.0.0.1:35954
+    1                         | ucx://127.0.0.1:53584
+    ================================================================================
+
diff --git a/docs/source/examples/worker_count.rst b/docs/source/examples/worker_count.rst
index 29c6502c..62954ffb 100644
--- a/docs/source/examples/worker_count.rst
+++ b/docs/source/examples/worker_count.rst
@@ -1,3 +1,5 @@
+.. _controlling-number-of-workers:
+
 Controlling number of workers
 =============================
 
@@ -44,4 +46,4 @@ These UUIDs can then be passed to ``CUDA_VISIBLE_DEVICES`` in place of a GPU ind
 .. code-block:: bash
 
     $ CUDA_VISIBLE_DEVICES="GPU-dae76d0e-3414-958a-8f3e-fc6682b36f31" \
-    > dask-cuda-worker 127.0.0.1:8786
+    > dask-cuda-worker 127.0.0.1:8786
diff --git a/docs/source/index.rst b/docs/source/index.rst
index efd7f62f..a43f2907 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -39,5 +39,6 @@ Contents
    :maxdepth: 1
    :caption: Examples
 
+   examples/best-practices
    examples/worker_count
    examples/ucx
diff --git a/requirements.txt b/requirements.txt
index a384bfc2..3d673a95 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-dask==2022.7.1
-distributed==2022.7.1
+dask==2022.9.2
+distributed==2022.9.2
 pynvml>=11.0.0
 numpy>=1.16.0
 numba>=0.54
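A minimal illustrative sketch (not part of the changeset above) of the idea behind the `DeviceSerialized.__reduce_ex__` change in `dask_cuda/device_host_file.py`: pickle refuses raw `memoryview` frames, while `numpy.asarray` wraps the same buffer zero-copy, and the resulting array pickles and round-trips cleanly. The snippet is standalone and does not depend on dask-cuda itself.

```python
# Sketch of the memoryview -> NumPy conversion used in the patch above.
import pickle

import numpy

frame = memoryview(bytearray(b"spilled-bytes"))

try:
    pickle.dumps(frame)  # raises: pickle cannot handle memoryview objects
except TypeError:
    pass

wrapped = numpy.asarray(frame)  # zero-copy wrapper around the same buffer
restored = pickle.loads(pickle.dumps(wrapped))
assert bytes(restored) == b"spilled-bytes"
```

Relatedly, one possible (hypothetical) way to exercise the new groupby benchmark together with the `--no-show-p2p-bandwidth` option added above, assuming dask-cuda is installed, is ``python -m dask_cuda.benchmarks.local_cudf_groupby -d 0 --shuffle explicit-comms --no-show-p2p-bandwidth``.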