From 18e562969d28beaff32665112b72b8b6bf872d25 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Wed, 20 May 2020 11:06:44 -0400 Subject: [PATCH 001/126] DOC v0.15 Updates From 5dedd6cf89d6d222cfcdb0397f6b2d2943dcdfa1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 28 May 2020 16:07:13 -0700 Subject: [PATCH 002/126] Initialize dask-cuda-worker parent process' UCX configuration --- dask_cuda/dask_cuda_worker.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 8b303f39..a1f3a280 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -23,6 +23,7 @@ from tornado.ioloop import IOLoop, TimeoutError from .device_host_file import DeviceHostFile +from .initialize import initialize from .local_cuda_cluster import cuda_visible_devices from .utils import ( CPUAffinity, @@ -308,6 +309,18 @@ def del_pid_file(): ) # pragma: no cover rmm_pool_size = parse_bytes(rmm_pool_size) + # Ensure this parent dask-cuda-worker process uses the same UCX + # configuration as child worker processes created by it. + initialize( + create_cuda_context=False, + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_infiniband=enable_infiniband, + enable_nvlink=enable_nvlink, + enable_rdmacm=enable_rdmacm, + net_devices=net_devices, + cuda_device_index=0, + ) + nannies = [ t( scheduler, From dc4094cff1e73dbc7793053f587e45af413ed0da Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 2 Jun 2020 02:37:50 -0700 Subject: [PATCH 003/126] Initialize LocalCUDACluster parent process' UCX configuration --- dask_cuda/local_cuda_cluster.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index eeaab4ea..5fb55d7e 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -209,6 +209,15 @@ def __init__( self.set_ucx_net_devices = enable_infiniband self.host = kwargs.get("host", None) + initialize( + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_nvlink=enable_nvlink, + enable_infiniband=enable_infiniband, + enable_rdmacm=enable_rdmacm, + net_devices=ucx_net_devices, + cuda_device_index=0, + ) + super().__init__( n_workers=0, threads_per_worker=threads_per_worker, From 27328b7c26c4f0f88b55631740fb79580bdf01a0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 2 Jun 2020 02:45:14 -0700 Subject: [PATCH 004/126] Add missing `initialize` import to LocalCUDACluster --- dask_cuda/local_cuda_cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 5fb55d7e..8bed6645 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -7,6 +7,7 @@ from distributed.worker import parse_memory_limit from .device_host_file import DeviceHostFile +from .initialize import initialize from .utils import ( CPUAffinity, RMMPool, From 7365dd58a0d72ce205829031dfd19bf33ac5c67a Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 12:05:33 -0700 Subject: [PATCH 005/126] Drop extra `.` --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 872da00f..170cc0fa 100644 --- a/README.md +++ b/README.md @@ -29,4 +29,4 @@ It only helps with deployment and management of Dask workers in multi-GPU systems. Parallelizing GPU libraries like [RAPIDS](https://rapids.ai) and [CuPy](https://cupy.chainer.org) with Dask is an ongoing effort. 
You may wish to read about this effort at [blog.dask.org](https://blog.dask.org) for more -information.. +information. From 23e1369e6eddc004289fddd67b04d6bcad98abb9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 12:05:47 -0700 Subject: [PATCH 006/126] Add RTD badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 170cc0fa..de6a5acc 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![RTD](https://readthedocs.org/projects/dask-cuda/badge/?version=latest)](https://dask-cuda.readthedocs.io/en/latest/?badge=latest) + Dask CUDA ========= From 1e5c224a80a23dbfc18e66f93d22b610d9823433 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 12:06:33 -0700 Subject: [PATCH 007/126] Link out to RTD docs --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index de6a5acc..da343f7c 100644 --- a/README.md +++ b/README.md @@ -31,4 +31,5 @@ It only helps with deployment and management of Dask workers in multi-GPU systems. Parallelizing GPU libraries like [RAPIDS](https://rapids.ai) and [CuPy](https://cupy.chainer.org) with Dask is an ongoing effort. You may wish to read about this effort at [blog.dask.org](https://blog.dask.org) for more -information. +information. Additional information about Dask-CUDA can also be found in the +[docs]( https://dask-cuda.readthedocs.io ). From cc2da1fa14ee8a4fd2b8f6a9faf081b1ec72e495 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 16:29:24 -0700 Subject: [PATCH 008/126] Handle `v` prefix in version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 79670b83..e25321a1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,7 +3,7 @@ VCS = git style = pep440 versionfile_source = dask_cuda/_version.py versionfile_build = dask_cuda/_version.py -tag_prefix = +tag_prefix = v parentdir_prefix = dask_cuda- [flake8] From d319c82e407377b9c80e19a932029edb340ef42f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 16:29:56 -0700 Subject: [PATCH 009/126] Add fullstops to sentences --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 70483769..4c8c1c38 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,9 +23,9 @@ copyright = "2020, NVIDIA" author = "NVIDIA" -# The short X.Y version +# The short X.Y version. version = "0.14" -# The full version, including alpha/beta/rc tags +# The full version, including alpha/beta/rc tags. release = "0.14.0" From 69fdac92da361516d429f6c1e349482d80b4bbe5 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 16:30:27 -0700 Subject: [PATCH 010/126] Swap long and short version order --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4c8c1c38..bded82b1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,10 +23,10 @@ copyright = "2020, NVIDIA" author = "NVIDIA" -# The short X.Y version. -version = "0.14" # The full version, including alpha/beta/rc tags. release = "0.14.0" +# The short X.Y version. 
+version = "0.14" # -- General configuration --------------------------------------------------- From 822fb5bbe62a49bdc2483ec41375905ca733e0c2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 16:30:54 -0700 Subject: [PATCH 011/126] Space out versions --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index bded82b1..83fe124b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -25,6 +25,7 @@ # The full version, including alpha/beta/rc tags. release = "0.14.0" + # The short X.Y version. version = "0.14" From e1898d5b750cc3c0bf31b60eaa0d4b442efb5852 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 2 Jun 2020 16:31:39 -0700 Subject: [PATCH 012/126] Use version from versioneer --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 83fe124b..213aafb5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,10 +24,10 @@ author = "NVIDIA" # The full version, including alpha/beta/rc tags. -release = "0.14.0" +from dask_cuda import __version__ as release # The short X.Y version. -version = "0.14" +version = ".".join(release.split(".")[:2]) # -- General configuration --------------------------------------------------- From 037ccbc1486f1490f1d6f12be049612b334b1d7f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 4 Jun 2020 16:20:07 -0700 Subject: [PATCH 013/126] Rely on Dask's ability to serialize collections As Dask is able to serialize collections of objects, simply serialize all of the parts together in one step. --- dask_cuda/device_host_file.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 2ca738ce..5f094472 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -41,27 +41,14 @@ def __sizeof__(self): @dask_serialize.register(DeviceSerialized) def device_serialize(obj): - headers = [] - all_frames = [] - for part in obj.parts: - header, frames = serialize(part) - header["frame-start-stop"] = [len(all_frames), len(all_frames) + len(frames)] - headers.append(header) - all_frames.extend(frames) - + headers, frames = serialize(obj.parts) header = {"sub-headers": headers, "main-header": obj.header} - - return header, all_frames + return header, frames @dask_deserialize.register(DeviceSerialized) def device_deserialize(header, frames): - parts = [] - for sub_header in header["sub-headers"]: - start, stop = sub_header.pop("frame-start-stop") - part = deserialize(sub_header, frames[start:stop]) - parts.append(part) - + parts = deserialize(header["sub-headers"], frames) return DeviceSerialized(header["main-header"], parts) From 46072965084507223d46f964b5f01ae864a65491 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 5 Jun 2020 05:27:59 -0700 Subject: [PATCH 014/126] Ensure CI installs GPU build of UCX --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 6428e522..ebffd95e 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -65,7 +65,7 @@ conda install "cudatoolkit=$CUDA_REL" \ conda install -c conda-forge "pytest" "pytest-asyncio" # Use nightly build of ucx-py for now -conda install -c rapidsai-nightly "ucx-py" +conda install -c rapidsai-nightly "ucx-py ucx-proc=*=gpu" conda list From 52f3398064e7d83b8c0b7f3b572c91368dbaae87 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: 
Fri, 5 Jun 2020 05:46:00 -0700 Subject: [PATCH 015/126] Add conda-forge to install ucx-proc from --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index ebffd95e..2aac189c 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -65,7 +65,7 @@ conda install "cudatoolkit=$CUDA_REL" \ conda install -c conda-forge "pytest" "pytest-asyncio" # Use nightly build of ucx-py for now -conda install -c rapidsai-nightly "ucx-py ucx-proc=*=gpu" +conda install -c rapidsai-nightly -c conda-forge "ucx-py ucx-proc=*=gpu" conda list From d9dff3e44fcbd70dba84c4c225b185a843772144 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 5 Jun 2020 06:09:49 -0700 Subject: [PATCH 016/126] Fix wrong formatting in build.sh --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 2aac189c..b4333e5f 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -65,7 +65,7 @@ conda install "cudatoolkit=$CUDA_REL" \ conda install -c conda-forge "pytest" "pytest-asyncio" # Use nightly build of ucx-py for now -conda install -c rapidsai-nightly -c conda-forge "ucx-py ucx-proc=*=gpu" +conda install -c rapidsai-nightly "ucx-py" "ucx-proc=*=gpu" conda list From f0d19888fd3830db686cc44d5d5f005d7e9356d3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 5 Jun 2020 09:21:26 -0700 Subject: [PATCH 017/126] Require Distributed 2.15.0+ This is needed for serialization fixes when working with collections of objects (particularly when they contain 5+ elements). --- conda/recipes/dask-cuda/meta.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 3f626876..8a9fb5f3 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -25,7 +25,7 @@ requirements: run: - python x.x - dask-core >=2.4.0 - - distributed >=2.7.0 + - distributed >=2.15.0 - pynvml >=8.0.3 - numpy >=1.16.0 - numba >=0.40.1 diff --git a/requirements.txt b/requirements.txt index cd79b261..517bf771 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ dask>=2.9.0 -distributed>=2.11.0 +distributed>=2.15.0 pynvml>=8.0.3 numpy>=1.16.0 numba>=0.40.1 From f36593eaeddf5347b3d45f0eff8cdbf76b5a60ee Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 5 Jun 2020 13:20:23 -0700 Subject: [PATCH 018/126] Skip 2nd serialization pass of `DeviceSerialized` As `"dask"` serialization already converts a CUDA object into headers and frames that Dask is able to work with, drop code that tries to serialize frames on host further (as they are already as simple as they can be). Cuts a fair bit of boilerplate from the spilling path, which should simplify things a bit. 
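(Illustrative aside, not taken from the patch series: the change above leans on Dask's `dask_serialize`/`dask_deserialize` dispatchers, which can be registered for any class whose state is a header dict plus a list of host-memory, bytes-like frames. The sketch below shows that registration pattern for a hypothetical `PackedBuffers` class; the class name and attributes are assumptions chosen purely for the example, not dask-cuda code.)

from distributed.protocol.serialize import dask_deserialize, dask_serialize


class PackedBuffers:
    """Hypothetical container: a header dict plus host-memory, bytes-like frames."""

    def __init__(self, header, frames):
        self.header = header
        self.frames = frames


@dask_serialize.register(PackedBuffers)
def packed_buffers_serialize(obj):
    # The frames are already plain bytes-like host objects, so they can be
    # handed straight to Dask's comm/spilling machinery without another pass.
    return {"obj-header": obj.header}, obj.frames


@dask_deserialize.register(PackedBuffers)
def packed_buffers_deserialize(header, frames):
    return PackedBuffers(header["obj-header"], frames)

Keeping the frames untouched at this layer is what lets the spilling path in the patch above drop the extra `numpy.asarray` copy it previously made.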
--- dask_cuda/device_host_file.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 5f094472..ccd55583 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -13,7 +13,6 @@ from distributed.utils import nbytes from distributed.worker import weight -import numpy from zict import Buffer, File, Func from zict.common import ZictBase @@ -31,37 +30,35 @@ class DeviceSerialized: that are in host memory """ - def __init__(self, header, parts): + def __init__(self, header, frames): self.header = header - self.parts = parts + self.frames = frames def __sizeof__(self): - return sum(map(nbytes, self.parts)) + return sum(map(nbytes, self.frames)) @dask_serialize.register(DeviceSerialized) def device_serialize(obj): - headers, frames = serialize(obj.parts) - header = {"sub-headers": headers, "main-header": obj.header} + header = {"obj-header": dict(obj.header)} + frames = list(obj.frames) return header, frames @dask_deserialize.register(DeviceSerialized) def device_deserialize(header, frames): - parts = deserialize(header["sub-headers"], frames) - return DeviceSerialized(header["main-header"], parts) + return DeviceSerialized(header["obj-header"], frames) @nvtx_annotate("SPILL_D2H", color="red", domain="dask_cuda") def device_to_host(obj: object) -> DeviceSerialized: header, frames = serialize(obj, serializers=["dask", "pickle"]) - frames = [numpy.asarray(f) for f in frames] return DeviceSerialized(header, frames) @nvtx_annotate("SPILL_H2D", color="green", domain="dask_cuda") def host_to_device(s: DeviceSerialized) -> object: - return deserialize(s.header, s.parts) + return deserialize(s.header, s.frames) class DeviceHostFile(ZictBase): From 19df1131d566d1bd094e37cbc1343c5f4c16a8ba Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 5 Jun 2020 15:08:00 -0700 Subject: [PATCH 019/126] Mark tests using rmm.get_info xfail --- dask_cuda/tests/test_dask_cuda_worker.py | 1 + dask_cuda/tests/test_local_cuda_cluster.py | 1 + 2 files changed, 2 insertions(+) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index c0297519..22f730aa 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -55,6 +55,7 @@ def get_visible_devices(): del os.environ["CUDA_VISIBLE_DEVICES"] +@pytest.mark.xfail(reason="rmm.get_info removed by https://github.com/rapidsai/rmm/pull/363") def test_rmm_pool(loop): # noqa: F811 rmm = pytest.importorskip("rmm") with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 968786ce..f84466b6 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -106,6 +106,7 @@ async def test_n_workers(): assert len(cluster.worker_spec) == 2 +@pytest.mark.xfail(reason="rmm.get_info removed by https://github.com/rapidsai/rmm/pull/363") @gen_test(timeout=20) async def test_rmm_pool(): rmm = pytest.importorskip("rmm") From ad278a57d27241df8b7e02a38856a2ea5cc11b51 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 5 Jun 2020 15:33:26 -0700 Subject: [PATCH 020/126] Use Numba memory manager in test_get_device_total_memory --- dask_cuda/tests/test_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 9780cd93..32f387f5 100644 --- 
a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -56,6 +56,9 @@ def test_cpu_affinity(): def test_get_device_total_memory(): + # Ensure Numba is using its own memory manager, rather than RMM's + cuda.set_memory_manager(cuda.cudadrv.driver.NumbaCUDAMemoryManager) + for i in range(get_n_gpus()): with cuda.gpus[i]: assert ( From abc44ff282efb55703996e26fc7d3cc534b7f706 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 5 Jun 2020 20:35:21 -0700 Subject: [PATCH 021/126] Require Distributed 2.18.0+ --- conda/recipes/dask-cuda/meta.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 8a9fb5f3..3404113c 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -25,7 +25,7 @@ requirements: run: - python x.x - dask-core >=2.4.0 - - distributed >=2.15.0 + - distributed >=2.18.0 - pynvml >=8.0.3 - numpy >=1.16.0 - numba >=0.40.1 diff --git a/requirements.txt b/requirements.txt index 517bf771..16141fc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ dask>=2.9.0 -distributed>=2.15.0 +distributed>=2.18.0 pynvml>=8.0.3 numpy>=1.16.0 numba>=0.40.1 From ac122bf0e6ba69e5c83bab770663efef10f76a44 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sat, 6 Jun 2020 12:26:33 -0700 Subject: [PATCH 022/126] Test RMM for resource type, remove xfail marks --- dask_cuda/tests/test_dask_cuda_worker.py | 7 +++---- dask_cuda/tests/test_local_cuda_cluster.py | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 22f730aa..c374b193 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -55,7 +55,6 @@ def get_visible_devices(): del os.environ["CUDA_VISIBLE_DEVICES"] -@pytest.mark.xfail(reason="rmm.get_info removed by https://github.com/rapidsai/rmm/pull/363") def test_rmm_pool(loop): # noqa: F811 rmm = pytest.importorskip("rmm") with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): @@ -79,6 +78,6 @@ def test_rmm_pool(loop): # noqa: F811 assert time() - start < 10 sleep(0.1) - memory_info = client.run(rmm.get_info) - for v in memory_info.values(): - assert v.total == 2000000000 + memory_resource_type = client.run(rmm.mr.get_default_resource_type) + for v in memory_resource_type.values(): + assert v is rmm._lib.memory_resource.CNMemMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index f84466b6..c6e9349f 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -106,13 +106,12 @@ async def test_n_workers(): assert len(cluster.worker_spec) == 2 -@pytest.mark.xfail(reason="rmm.get_info removed by https://github.com/rapidsai/rmm/pull/363") @gen_test(timeout=20) async def test_rmm_pool(): rmm = pytest.importorskip("rmm") async with LocalCUDACluster(rmm_pool_size="2GB", asynchronous=True) as cluster: async with Client(cluster, asynchronous=True) as client: - memory_info = await client.run(rmm.get_info) - for v in memory_info.values(): - assert v.total == 2000000000 + memory_resource_type = await client.run(rmm.mr.get_default_resource_type) + for v in memory_resource_type.values(): + assert v is rmm._lib.memory_resource.CNMemMemoryResource From c7c6eea4e9bf19e5130dd3d66329f8422007f00b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Sun, 7 
Jun 2020 21:44:15 -0700 Subject: [PATCH 023/126] Skip shallow copy in serialization --- dask_cuda/device_host_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index ccd55583..79bdcc01 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -40,8 +40,8 @@ def __sizeof__(self): @dask_serialize.register(DeviceSerialized) def device_serialize(obj): - header = {"obj-header": dict(obj.header)} - frames = list(obj.frames) + header = {"obj-header": obj.header} + frames = obj.frames return header, frames From 7398e72fbfee80417d02be2e95da4df10f332078 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 9 Jun 2020 09:16:28 -0700 Subject: [PATCH 024/126] Mark test_get_device_total_memory xfail --- dask_cuda/tests/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 32f387f5..89418170 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -55,6 +55,7 @@ def test_cpu_affinity(): assert list(os.sched_getaffinity(0)) == affinity +@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/313") def test_get_device_total_memory(): # Ensure Numba is using its own memory manager, rather than RMM's cuda.set_memory_manager(cuda.cudadrv.driver.NumbaCUDAMemoryManager) From 81b86c96cc6304e937e82c2d41f14596a1ad48c6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 9 Jun 2020 09:36:01 -0700 Subject: [PATCH 025/126] Fix dask-cuda-worker's interface argument --- dask_cuda/dask_cuda_worker.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index a1f3a280..df568e04 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -292,11 +292,8 @@ def del_pid_file(): "dask-worker SCHEDULER_ADDRESS:8786" ) - if interface: - if host: - raise ValueError("Can not specify both interface and host") - else: - host = get_ip_interface(interface) + if interface and host: + raise ValueError("Can not specify both interface and host") if rmm_pool_size is not None: try: @@ -330,12 +327,13 @@ def del_pid_file(): loop=loop, resources=resources, memory_limit=memory_limit, - interface=get_ucx_net_devices( + interface=interface or get_ucx_net_devices( cuda_device_index=i, ucx_net_devices=net_devices, get_openfabrics=False, get_network=True, ), + host=host, preload=(list(preload) or []) + ["dask_cuda.initialize"], preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], security=sec, From 28f880d62ab9cc3aaaf8003b26d738e7aa87387d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 9 Jun 2020 10:17:09 -0700 Subject: [PATCH 026/126] Fix dask-cuda-worker formatting --- dask_cuda/dask_cuda_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index df568e04..1f399fbd 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -327,7 +327,8 @@ def del_pid_file(): loop=loop, resources=resources, memory_limit=memory_limit, - interface=interface or get_ucx_net_devices( + interface=interface + or get_ucx_net_devices( cuda_device_index=i, ucx_net_devices=net_devices, get_openfabrics=False, From f84dc5ac1ef31b1b70de57f7f7331cb69a39eaf1 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 9 Jun 2020 12:30:40 -0700 Subject: [PATCH 027/126] lint --- 
dask_cuda/dask_cuda_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 1f399fbd..151faba3 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -14,7 +14,7 @@ enable_proctitle_on_current, ) from distributed.security import Security -from distributed.utils import get_ip_interface, parse_bytes +from distributed.utils import parse_bytes from distributed.worker import parse_memory_limit import click From 4c9172c3096bd192f37089b3fada139683c3d616 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 9 Jun 2020 14:18:21 -0700 Subject: [PATCH 028/126] Fix usage of --host in dask-cuda-worker --- dask_cuda/dask_cuda_worker.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 151faba3..355ce76d 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -41,6 +41,18 @@ pem_file_option_type = click.Path(exists=True, resolve_path=True) +def _get_interface(interface, host, cuda_device_index, ucx_net_devices): + if host: + return None + else: + return interface or get_ucx_net_devices( + cuda_device_index=cuda_device_index, + ucx_net_devices=ucx_net_devices, + get_openfabrics=False, + get_network=True, + ) + + @click.command(context_settings=dict(ignore_unknown_options=True)) @click.argument("scheduler", type=str, required=False) @click.option( @@ -327,13 +339,7 @@ def del_pid_file(): loop=loop, resources=resources, memory_limit=memory_limit, - interface=interface - or get_ucx_net_devices( - cuda_device_index=i, - ucx_net_devices=net_devices, - get_openfabrics=False, - get_network=True, - ), + interface=_get_interface(interface, host, i, net_devices), host=host, preload=(list(preload) or []) + ["dask_cuda.initialize"], preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], From 81277afad06aaff915c418272880e69bba19d0c6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 10 Jun 2020 16:36:40 -0700 Subject: [PATCH 029/126] Close CUDA context before running test_get_device_total_memory --- dask_cuda/tests/test_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 89418170..5af0422e 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -55,9 +55,10 @@ def test_cpu_affinity(): assert list(os.sched_getaffinity(0)) == affinity -@pytest.mark.xfail(reason="https://github.com/rapidsai/dask-cuda/issues/313") def test_get_device_total_memory(): - # Ensure Numba is using its own memory manager, rather than RMM's + # Close CUDA context and Ensure Numba is using its own memory manager, + # rather than RMM's + cuda.close() cuda.set_memory_manager(cuda.cudadrv.driver.NumbaCUDAMemoryManager) for i in range(get_n_gpus()): From 506b4a6f205586e6a5190a273d26def759fad0d0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 12 Jun 2020 03:28:48 -0700 Subject: [PATCH 030/126] Simplify test_get_device_total_memory --- dask_cuda/tests/test_utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 5af0422e..bd679a36 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -56,17 +56,11 @@ def test_cpu_affinity(): def test_get_device_total_memory(): - # Close CUDA context and Ensure Numba is using its 
own memory manager, - # rather than RMM's - cuda.close() - cuda.set_memory_manager(cuda.cudadrv.driver.NumbaCUDAMemoryManager) - for i in range(get_n_gpus()): with cuda.gpus[i]: - assert ( - get_device_total_memory(i) - == cuda.current_context().get_memory_info()[1] - ) + total_mem = get_device_total_memory(i) + assert type(total_mem) is int + assert total_mem > 0 @pytest.mark.parametrize("enable_tcp", [True, False]) From 41c19e3c6f443f91c9bbd7018b3672a43547d047 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 Jun 2020 06:30:09 -0700 Subject: [PATCH 031/126] Fix RDMACM test --- dask_cuda/tests/test_dgx.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 9c2b3b60..08032268 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -228,6 +228,8 @@ def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm): sched_env = os.environ.copy() sched_env["DASK_UCX__INFINIBAND"] = "True" sched_env["DASK_UCX__TCP"] = "True" + sched_env["DASK_UCX__CUDA_COPY"] = "True" + sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0] if enable_rdmacm: sched_env["DASK_UCX__RDMACM"] = "True" @@ -246,7 +248,10 @@ def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm): # Enable proper variables for client initialize( - enable_tcp_over_ucx=True, enable_infiniband=True, enable_rdmacm=enable_rdmacm + enable_tcp_over_ucx=True, + enable_infiniband=True, + enable_rdmacm=enable_rdmacm, + net_devices=openfabrics_devices[0], ) with subprocess.Popen( From 332a206ba5fe1d1767c1e54cd928ae501e4f406b Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Mon, 15 Jun 2020 11:24:06 -0400 Subject: [PATCH 032/126] Install dependencies via meta package --- ci/gpu/build.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b4333e5f..2b4015ae 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -57,9 +57,12 @@ conda list # Fixing Numpy version to avoid RuntimeWarning: numpy.ufunc size changed, may # indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject conda install "cudatoolkit=$CUDA_REL" \ - "cupy>=6.5.0" "numpy=1.16.4" \ "cudf=${MINOR_VERSION}" "dask-cudf=${MINOR_VERSION}" \ - "dask>=2.8.1" "distributed>=2.8.1" + "rapids-build-env=$MINOR_VERSION.*" + +# https://docs.rapids.ai/maintainers/depmgmt/ +# conda remove -f rapids-build-env +# conda install "your-pkg=1.0.0" # needed for async tests conda install -c conda-forge "pytest" "pytest-asyncio" From bbe94fb5f9dbebb2df6b39c8c8ec4efeab88f6fe Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Mon, 15 Jun 2020 11:33:22 -0400 Subject: [PATCH 033/126] Install dependencies via meta package --- ci/gpu/build.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 2b4015ae..c2c34db7 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -58,17 +58,13 @@ conda list # indicate binary incompatibility. 
Expected 192 from C header, got 216 from PyObject conda install "cudatoolkit=$CUDA_REL" \ "cudf=${MINOR_VERSION}" "dask-cudf=${MINOR_VERSION}" \ + "ucx-py=$MINOR_VERSION.*" "ucx-proc=*=gpu" \ "rapids-build-env=$MINOR_VERSION.*" # https://docs.rapids.ai/maintainers/depmgmt/ # conda remove -f rapids-build-env # conda install "your-pkg=1.0.0" -# needed for async tests -conda install -c conda-forge "pytest" "pytest-asyncio" - -# Use nightly build of ucx-py for now -conda install -c rapidsai-nightly "ucx-py" "ucx-proc=*=gpu" conda list From c1e46b4a93b151534b9bc838dfbd57910c7e6689 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 Jun 2020 14:21:41 -0700 Subject: [PATCH 034/126] Ensure ucx_net_devices test runs successfully with many GPUs --- dask_cuda/tests/test_dgx.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 08032268..8219adad 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -6,7 +6,7 @@ import dask.array as da from dask_cuda import LocalCUDACluster from dask_cuda.initialize import initialize -from dask_cuda.utils import get_gpu_count +from dask_cuda.utils import get_n_gpus from distributed import Client from distributed.metrics import time from distributed.utils import get_ip_interface @@ -277,11 +277,16 @@ def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm): start = time() while True: - if len(client.scheduler_info()["workers"]) == get_gpu_count(): + n_gpus = get_n_gpus() + if len(client.scheduler_info()["workers"]) == n_gpus: break - else: - assert time() - start < 10 - sleep(0.1) + elif time() - start > 2 * n_gpus: + # We need to ensure processes are terminated to avoid hangs + # if a timeout occurs, and then raise an assertion error + worker_proc.kill() + sched_proc.kill() + assert time() - start < 2 * n_gpus + sleep(0.1) workers_tls = client.run(lambda: ucp.get_config()["TLS"]) workers_tls_priority = client.run( From f4a965fe85685e6566a506ffc242b43c05d4918a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 15 Jun 2020 16:07:40 -0700 Subject: [PATCH 035/126] Add new wait_workers test utility function --- dask_cuda/utils.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index b52156fb..022ac19d 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -1,5 +1,6 @@ import math import os +import time import warnings from multiprocessing import cpu_count @@ -307,3 +308,37 @@ def get_preload_options( preload_options["preload_argv"].extend(initialize_ucx_argv) return preload_options + + +def wait_workers(client, seconds_per_gpu=2, timeout_callback=None): + """ + Wait for workers to be available. When a timeout occurs, a callback + is executed if specified. Generally used for tests. + + Parameters + ---------- + client: distributed.Client + Instance of client, used to query for number of workers connected. + seconds_per_gpu: float + Seconds to wait for each GPU on the system. For example, if its + value is 2 and there is a total of 8 GPUs (workers) being started, + a timeout will occur after 16 seconds. + timeout_callback: None or callable + A callback function to be executed if a timeout occurs, ignored if + None. + + Returns + ------- + True if all workers were started, False if a timeout occurs. 
+ """ + start = time.time() + while True: + n_gpus = get_n_gpus() + if len(client.scheduler_info()["workers"]) == n_gpus: + break + elif time.time() - start > seconds_per_gpu * n_gpus: + if callable(timeout_callback): + timeout_callback() + return False + time.sleep(0.1) + return True From 8f069ea736d047ca889c33e30e725531624a5c8a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 06:04:14 -0700 Subject: [PATCH 036/126] Replace worker waiting test code by wait_workers --- dask_cuda/tests/test_dask_cuda_worker.py | 18 +++--------------- dask_cuda/tests/test_dgx.py | 21 ++++++++------------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index c374b193..2c99caa5 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -3,7 +3,7 @@ import os from time import sleep -from dask_cuda.utils import get_gpu_count +from dask_cuda.utils import wait_workers from distributed import Client from distributed.metrics import time from distributed.system import MEMORY_LIMIT @@ -29,13 +29,7 @@ def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 ] ): with Client("127.0.0.1:9359", loop=loop) as client: - start = time() - while True: - if len(client.scheduler_info()["workers"]) == 4: - break - else: - assert time() - start < 10 - sleep(0.1) + assert wait_workers(client) def get_visible_devices(): return os.environ["CUDA_VISIBLE_DEVICES"] @@ -70,13 +64,7 @@ def test_rmm_pool(loop): # noqa: F811 ] ): with Client("127.0.0.1:9369", loop=loop) as client: - start = time() - while True: - if len(client.scheduler_info()["workers"]) == get_gpu_count(): - break - else: - assert time() - start < 10 - sleep(0.1) + assert wait_workers(client) memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 8219adad..38aaa633 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -6,7 +6,7 @@ import dask.array as da from dask_cuda import LocalCUDACluster from dask_cuda.initialize import initialize -from dask_cuda.utils import get_n_gpus +from dask_cuda.utils import wait_workers from distributed import Client from distributed.metrics import time from distributed.utils import get_ip_interface @@ -275,18 +275,13 @@ def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm): ) as worker_proc: with Client(sched_url, loop=loop) as client: - start = time() - while True: - n_gpus = get_n_gpus() - if len(client.scheduler_info()["workers"]) == n_gpus: - break - elif time() - start > 2 * n_gpus: - # We need to ensure processes are terminated to avoid hangs - # if a timeout occurs, and then raise an assertion error - worker_proc.kill() - sched_proc.kill() - assert time() - start < 2 * n_gpus - sleep(0.1) + def _timeout_callback(): + # We must ensure processes are terminated to avoid hangs + # if a timeout occurs + worker_proc.kill() + sched_proc.kill() + + assert wait_workers(client, timeout_callback=_timeout_callback) workers_tls = client.run(lambda: ucp.get_config()["TLS"]) workers_tls_priority = client.run( From 07d2f735cd13c2ad8863be1eb877e80fe83f5d22 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 06:35:57 -0700 Subject: [PATCH 037/126] Remove unused imports --- dask_cuda/tests/test_dask_cuda_worker.py | 2 -- dask_cuda/tests/test_dgx.py | 1 - 2 files changed, 3 deletions(-) diff 
--git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 2c99caa5..adc8fb6c 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -1,11 +1,9 @@ from __future__ import absolute_import, division, print_function import os -from time import sleep from dask_cuda.utils import wait_workers from distributed import Client -from distributed.metrics import time from distributed.system import MEMORY_LIMIT from distributed.utils_test import loop # noqa: F401 from distributed.utils_test import popen diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 38aaa633..9d588584 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -8,7 +8,6 @@ from dask_cuda.initialize import initialize from dask_cuda.utils import wait_workers from distributed import Client -from distributed.metrics import time from distributed.utils import get_ip_interface import numpy From 37f2e703bd8ddcb9d776daa7f471c7c064e4d34b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 07:58:44 -0700 Subject: [PATCH 038/126] Add parameter min_timeout to wait_workers --- dask_cuda/utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 022ac19d..c8189530 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -310,7 +310,7 @@ def get_preload_options( return preload_options -def wait_workers(client, seconds_per_gpu=2, timeout_callback=None): +def wait_workers(client, min_timeout=10, seconds_per_gpu=2, timeout_callback=None): """ Wait for workers to be available. When a timeout occurs, a callback is executed if specified. Generally used for tests. @@ -319,10 +319,13 @@ def wait_workers(client, seconds_per_gpu=2, timeout_callback=None): ---------- client: distributed.Client Instance of client, used to query for number of workers connected. + min_timeout: float + Minimum number of seconds to wait before timeout. seconds_per_gpu: float Seconds to wait for each GPU on the system. For example, if its value is 2 and there is a total of 8 GPUs (workers) being started, - a timeout will occur after 16 seconds. + a timeout will occur after 16 seconds. Note that this value is only + used as timeout when larger than min_timeout. timeout_callback: None or callable A callback function to be executed if a timeout occurs, ignored if None. @@ -332,11 +335,12 @@ def wait_workers(client, seconds_per_gpu=2, timeout_callback=None): True if all workers were started, False if a timeout occurs. 
""" start = time.time() + n_gpus = get_n_gpus() + timeout = max(10, seconds_per_gpu * n_gpus) while True: - n_gpus = get_n_gpus() if len(client.scheduler_info()["workers"]) == n_gpus: break - elif time.time() - start > seconds_per_gpu * n_gpus: + elif time.time() - start > timeout: if callable(timeout_callback): timeout_callback() return False From 0a0add8369b5a34c8ddfdc5f18309adea4363538 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 14:06:36 -0700 Subject: [PATCH 039/126] Fix test_local_cuda_cluster for 10 or more devices --- dask_cuda/tests/test_local_cuda_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index c6e9349f..2958dc29 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -34,7 +34,7 @@ def get_visible_devices(): assert full_mem >= MEMORY_LIMIT - 1024 and full_mem < MEMORY_LIMIT + 1024 for w, devices in result.items(): - ident = devices[0] + ident = devices.split(",")[0] assert int(ident) == cluster.scheduler.workers[w].name with pytest.raises(ValueError): From 3146ecababe0ac213e6042cd0f932d994620b471 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 15:04:34 -0700 Subject: [PATCH 040/126] Use set in test_cpu_affinity to avoid different ordering --- dask_cuda/tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index bd679a36..60d52d51 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -52,7 +52,7 @@ def test_cpu_affinity(): for i in range(get_n_gpus()): affinity = get_cpu_affinity(i) os.sched_setaffinity(0, affinity) - assert list(os.sched_getaffinity(0)) == affinity + assert os.sched_getaffinity(0) == set(affinity) def test_get_device_total_memory(): From 14b8b33c93af59e9d157d75923a75c16a4769af6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 15:41:24 -0700 Subject: [PATCH 041/126] Add support for DGX A100 where possible --- dask_cuda/tests/test_dgx.py | 50 ++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 9d588584..1faf2a44 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -1,6 +1,7 @@ import multiprocessing as mp import os import subprocess +from enum import Enum, auto from time import sleep import dask.array as da @@ -19,25 +20,38 @@ psutil = pytest.importorskip("psutil") -def _check_dgx_version(): - dgx_server = None +class DGXVersion(Enum): + DGX_1 = auto() + DGX_2 = auto() + DGX_A100 = auto() + + +def _get_dgx_name(): + product_name_file = "/sys/class/dmi/id/product_name" + + if not os.path.isfile(product_name_file): + return None - if not os.path.isfile("/etc/dgx-release"): - return dgx_server + for line in open(product_name_file): + return line - for line in open("/etc/dgx-release"): - if line.startswith("DGX_PLATFORM"): - if "DGX Server for DGX-1" in line: - dgx_server = 1 - elif "DGX Server for DGX-2" in line: - dgx_server = 2 - break + +def _get_dgx_version(): + dgx_server = None + dgx_name = _get_dgx_name() + + if "DGX-1" in dgx_name: + dgx_server = DGXVersion.DGX_1 + elif "DGX-2" in dgx_name: + dgx_server = DGXVersion.DGX_2 + elif "DGXA100" in dgx_name: + dgx_server = DGXVersion.DGX_A100 return dgx_server def _get_dgx_net_devices(): - if 
_check_dgx_version() == 1: + if _get_dgx_version() == DGXVersion.DGX_A100: return [ "mlx5_0:1,ib0", "mlx5_0:1,ib0", @@ -48,7 +62,7 @@ def _get_dgx_net_devices(): "mlx5_3:1,ib3", "mlx5_3:1,ib3", ] - elif _check_dgx_version() == 2: + elif _get_dgx_version() == DGXVersion.DGX_2: return [ "mlx5_0:1,ib0", "mlx5_0:1,ib0", @@ -71,7 +85,7 @@ def _get_dgx_net_devices(): return None -if _check_dgx_version() is None: +if _get_dgx_version() is None: pytest.skip("Not a DGX server", allow_module_level=True) @@ -200,6 +214,10 @@ def check_ucx_options(): {"enable_infiniband": True, "enable_nvlink": True, "enable_rdmacm": True}, ], ) +@pytest.mark.skipif( + _get_dgx_version() == DGXVersion.DGX_A100, + reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(), +) def test_ucx_infiniband_nvlink(params): p = mp.Process( target=_test_ucx_infiniband_nvlink, @@ -311,6 +329,10 @@ def _timeout_callback(): @pytest.mark.parametrize("enable_rdmacm", [False, True]) +@pytest.mark.skipif( + _get_dgx_version() == DGXVersion.DGX_A100, + reason="Automatic InfiniBand device detection Unsupported for %s" % _get_dgx_name(), +) def test_dask_cuda_worker_ucx_net_devices(enable_rdmacm): p = mp.Process( target=_test_dask_cuda_worker_ucx_net_devices, args=(enable_rdmacm,), From a302a07242409e553012dc7d293df5e8fa768a06 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 17 Jun 2020 15:45:23 -0700 Subject: [PATCH 042/126] Fix DGX-1 name --- dask_cuda/tests/test_dgx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 1faf2a44..d867a9ae 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -51,7 +51,7 @@ def _get_dgx_version(): def _get_dgx_net_devices(): - if _get_dgx_version() == DGXVersion.DGX_A100: + if _get_dgx_version() == DGXVersion.DGX_1: return [ "mlx5_0:1,ib0", "mlx5_0:1,ib0", From 63a5721a72de008a2345a8435b7d74019b35db5b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 18 Jun 2020 06:20:48 -0700 Subject: [PATCH 043/126] Fix usage of min_timeout in wait_workers --- dask_cuda/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index c8189530..7f9e5be0 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -334,15 +334,16 @@ def wait_workers(client, min_timeout=10, seconds_per_gpu=2, timeout_callback=Non ------- True if all workers were started, False if a timeout occurs. 
""" - start = time.time() n_gpus = get_n_gpus() - timeout = max(10, seconds_per_gpu * n_gpus) + timeout = max(min_timeout, seconds_per_gpu * n_gpus) + + start = time.time() while True: if len(client.scheduler_info()["workers"]) == n_gpus: - break + return True elif time.time() - start > timeout: if callable(timeout_callback): timeout_callback() return False - time.sleep(0.1) - return True + else: + time.sleep(0.1) From ba3f35379f685b2638818801e00591c625fcc6c3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 Jun 2020 07:48:33 -0700 Subject: [PATCH 044/126] Add new n_gpus argument to wait_workers --- dask_cuda/tests/test_dask_cuda_worker.py | 6 +++--- dask_cuda/utils.py | 9 +++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index adc8fb6c..e2132888 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -2,7 +2,7 @@ import os -from dask_cuda.utils import wait_workers +from dask_cuda.utils import get_gpu_count, wait_workers from distributed import Client from distributed.system import MEMORY_LIMIT from distributed.utils_test import loop # noqa: F401 @@ -27,7 +27,7 @@ def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 ] ): with Client("127.0.0.1:9359", loop=loop) as client: - assert wait_workers(client) + assert wait_workers(client, n_gpus=4) def get_visible_devices(): return os.environ["CUDA_VISIBLE_DEVICES"] @@ -62,7 +62,7 @@ def test_rmm_pool(loop): # noqa: F811 ] ): with Client("127.0.0.1:9369", loop=loop) as client: - assert wait_workers(client) + assert wait_workers(client, n_gpus=get_gpu_count()) memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 7f9e5be0..e102c864 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -310,7 +310,9 @@ def get_preload_options( return preload_options -def wait_workers(client, min_timeout=10, seconds_per_gpu=2, timeout_callback=None): +def wait_workers( + client, min_timeout=10, seconds_per_gpu=2, n_gpus=None, timeout_callback=None +): """ Wait for workers to be available. When a timeout occurs, a callback is executed if specified. Generally used for tests. @@ -326,6 +328,9 @@ def wait_workers(client, min_timeout=10, seconds_per_gpu=2, timeout_callback=Non value is 2 and there is a total of 8 GPUs (workers) being started, a timeout will occur after 16 seconds. Note that this value is only used as timeout when larger than min_timeout. + n_gpus: None or int + If specified, will wait for a that amount of GPUs (i.e., Dask workers) + to come online, else waits for a total of `get_n_gpus` workers. timeout_callback: None or callable A callback function to be executed if a timeout occurs, ignored if None. @@ -334,7 +339,7 @@ def wait_workers(client, min_timeout=10, seconds_per_gpu=2, timeout_callback=Non ------- True if all workers were started, False if a timeout occurs. 
""" - n_gpus = get_n_gpus() + n_gpus = n_gpus or get_n_gpus() timeout = max(min_timeout, seconds_per_gpu * n_gpus) start = time.time() From 2f6433e5a28612b0717ed1b1089e44196f976013 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 Jun 2020 12:23:42 -0700 Subject: [PATCH 045/126] Revert worker waiting code in test_dask_cuda_worker --- dask_cuda/tests/test_dask_cuda_worker.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index e2132888..c374b193 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -1,9 +1,11 @@ from __future__ import absolute_import, division, print_function import os +from time import sleep -from dask_cuda.utils import get_gpu_count, wait_workers +from dask_cuda.utils import get_gpu_count from distributed import Client +from distributed.metrics import time from distributed.system import MEMORY_LIMIT from distributed.utils_test import loop # noqa: F401 from distributed.utils_test import popen @@ -27,7 +29,13 @@ def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 ] ): with Client("127.0.0.1:9359", loop=loop) as client: - assert wait_workers(client, n_gpus=4) + start = time() + while True: + if len(client.scheduler_info()["workers"]) == 4: + break + else: + assert time() - start < 10 + sleep(0.1) def get_visible_devices(): return os.environ["CUDA_VISIBLE_DEVICES"] @@ -62,7 +70,13 @@ def test_rmm_pool(loop): # noqa: F811 ] ): with Client("127.0.0.1:9369", loop=loop) as client: - assert wait_workers(client, n_gpus=get_gpu_count()) + start = time() + while True: + if len(client.scheduler_info()["workers"]) == get_gpu_count(): + break + else: + assert time() - start < 10 + sleep(0.1) memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): From 8637bf869e71106e30813dc38af01f86d1553b31 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 Jun 2020 14:53:40 -0700 Subject: [PATCH 046/126] Only create Security object if TLS files are specified --- dask_cuda/dask_cuda_worker.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 355ce76d..ac2eb747 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -247,9 +247,12 @@ def main( enable_proctitle_on_current() enable_proctitle_on_children() - sec = Security( - tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key - ) + if tls_ca_file and tls_cert and tls_worker_key: + sec = Security( + tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key + ) + else: + sec = None try: nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) @@ -343,7 +346,7 @@ def del_pid_file(): host=host, preload=(list(preload) or []) + ["dask_cuda.initialize"], preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], - security=sec, + #security=sec, env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, name=name if nprocs == 1 or not name else name + "-" + str(i), From a2a9358cb6a12367aa562f285b7c6ae7e4c54d57 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 Jun 2020 14:59:58 -0700 Subject: [PATCH 047/126] Fix argument tls_key argument name --- dask_cuda/dask_cuda_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index ac2eb747..12258ffb 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -247,7 +247,7 @@ def main( enable_proctitle_on_current() enable_proctitle_on_children() - if tls_ca_file and tls_cert and tls_worker_key: + if tls_ca_file and tls_cert and tls_key: sec = Security( tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key ) @@ -346,7 +346,7 @@ def del_pid_file(): host=host, preload=(list(preload) or []) + ["dask_cuda.initialize"], preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], - #security=sec, + security=sec, env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, name=name if nprocs == 1 or not name else name + "-" + str(i), From ee9bbd9250d45ded2ee810a5954d3133a905c13e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 19 Jun 2020 15:31:32 -0700 Subject: [PATCH 048/126] Revert "Revert worker waiting code in test_dask_cuda_worker" This reverts commit 2f6433e5a28612b0717ed1b1089e44196f976013. --- dask_cuda/tests/test_dask_cuda_worker.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index c374b193..e2132888 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -1,11 +1,9 @@ from __future__ import absolute_import, division, print_function import os -from time import sleep -from dask_cuda.utils import get_gpu_count +from dask_cuda.utils import get_gpu_count, wait_workers from distributed import Client -from distributed.metrics import time from distributed.system import MEMORY_LIMIT from distributed.utils_test import loop # noqa: F401 from distributed.utils_test import popen @@ -29,13 +27,7 @@ def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 ] ): with Client("127.0.0.1:9359", loop=loop) as client: - start = time() - while True: - if len(client.scheduler_info()["workers"]) == 4: - break - else: - assert time() - start < 10 - sleep(0.1) + assert wait_workers(client, n_gpus=4) def get_visible_devices(): return os.environ["CUDA_VISIBLE_DEVICES"] @@ -70,13 +62,7 @@ def test_rmm_pool(loop): # noqa: F811 ] ): with Client("127.0.0.1:9369", loop=loop) as client: - start = time() - while True: - if len(client.scheduler_info()["workers"]) == get_gpu_count(): - break - else: - assert time() - start < 10 - sleep(0.1) + assert wait_workers(client, n_gpus=get_gpu_count()) memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): From 46d298c73caf865bb6efdc05ae1999febe535a1d Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Wed, 24 Jun 2020 06:11:57 -0700 Subject: [PATCH 049/126] Refactor worker into CUDAWorker class --- dask_cuda/dask_cuda_worker.py | 342 +++++++++++++++++++++------------- 1 file changed, 208 insertions(+), 134 deletions(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 12258ffb..2a3f537d 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, print_function +import asyncio import atexit import logging import multiprocessing @@ -244,160 +245,233 @@ def main( net_devices, **kwargs, ): - enable_proctitle_on_current() - enable_proctitle_on_children() - - if tls_ca_file and tls_cert and tls_key: - sec = Security( - 
tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key - ) - else: - sec = None + worker = CUDAWorker( + scheduler, + host, + nthreads, + name, + memory_limit, + device_memory_limit, + rmm_pool_size, + pid_file, + resources, + dashboard, + dashboard_address, + local_directory, + scheduler_file, + interface, + death_timeout, + preload, + dashboard_prefix, + tls_ca_file, + tls_cert, + tls_key, + enable_tcp_over_ucx, + enable_infiniband, + enable_nvlink, + enable_rdmacm, + net_devices, + **kwargs, + ) - try: - nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) - except KeyError: - nprocs = get_n_gpus() + async def on_signal(signum): + logger.info("Exiting on signal %d", signum) + await worker.close() - if not nthreads: - nthreads = min(1, multiprocessing.cpu_count() // nprocs) + async def run(): + await worker + await worker.finished() - memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs) + loop = IOLoop.current() - if pid_file: - with open(pid_file, "w") as f: - f.write(str(os.getpid())) + install_signal_handlers(loop, cleanup=on_signal) - def del_pid_file(): - if os.path.exists(pid_file): - os.remove(pid_file) + try: + loop.run_sync(run) + except (KeyboardInterrupt, TimeoutError): + pass + finally: + logger.info("End worker") - atexit.register(del_pid_file) - services = {} +class CUDAWorker: + def __init__( + self, + scheduler, + host, + nthreads, + name, + memory_limit, + device_memory_limit, + rmm_pool_size, + pid_file, + resources, + dashboard, + dashboard_address, + local_directory, + scheduler_file, + interface, + death_timeout, + preload, + dashboard_prefix, + tls_ca_file, + tls_cert, + tls_key, + enable_tcp_over_ucx, + enable_infiniband, + enable_nvlink, + enable_rdmacm, + net_devices, + **kwargs, + ): + enable_proctitle_on_current() + enable_proctitle_on_children() + + if tls_ca_file and tls_cert and tls_key: + sec = Security( + tls_ca_file=tls_ca_file, + tls_worker_cert=tls_cert, + tls_worker_key=tls_key, + ) + else: + sec = None - if dashboard: try: - from distributed.dashboard import BokehWorker - except ImportError: - pass - else: - if dashboard_prefix: - result = (BokehWorker, {"prefix": dashboard_prefix}) - else: - result = BokehWorker - services[("dashboard", dashboard_address)] = result + nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) + except KeyError: + nprocs = get_n_gpus() - if resources: - resources = resources.replace(",", " ").split() - resources = dict(pair.split("=") for pair in resources) - resources = valmap(float, resources) - else: - resources = None + if not nthreads: + nthreads = min(1, multiprocessing.cpu_count() // nprocs) - loop = IOLoop.current() + memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs) - preload_argv = kwargs.get("preload_argv", []) - kwargs = {"worker_port": None, "listen_address": None} - t = Nanny + if pid_file: + with open(pid_file, "w") as f: + f.write(str(os.getpid())) - if not scheduler and not scheduler_file and "scheduler-address" not in config: - raise ValueError( - "Need to provide scheduler address like\n" - "dask-worker SCHEDULER_ADDRESS:8786" - ) + def del_pid_file(): + if os.path.exists(pid_file): + os.remove(pid_file) - if interface and host: - raise ValueError("Can not specify both interface and host") + atexit.register(del_pid_file) - if rmm_pool_size is not None: - try: - import rmm # noqa F401 - except ImportError: - raise ValueError( - "RMM pool requested but module 'rmm' is not available. 
" - "For installation instructions, please see " - "https://github.com/rapidsai/rmm" - ) # pragma: no cover - rmm_pool_size = parse_bytes(rmm_pool_size) - - # Ensure this parent dask-cuda-worker process uses the same UCX - # configuration as child worker processes created by it. - initialize( - create_cuda_context=False, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - enable_rdmacm=enable_rdmacm, - net_devices=net_devices, - cuda_device_index=0, - ) - - nannies = [ - t( - scheduler, - scheduler_file=scheduler_file, - nthreads=nthreads, - services=services, - loop=loop, - resources=resources, - memory_limit=memory_limit, - interface=_get_interface(interface, host, i, net_devices), - host=host, - preload=(list(preload) or []) + ["dask_cuda.initialize"], - preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], - security=sec, - env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, - plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, - name=name if nprocs == 1 or not name else name + "-" + str(i), - local_directory=local_directory, - config={ - "ucx": get_ucx_config( - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - enable_rdmacm=enable_rdmacm, - net_devices=net_devices, - cuda_device_index=i, - ) - }, - data=( - DeviceHostFile, - { - "device_memory_limit": get_device_total_memory(index=i) - if (device_memory_limit == "auto" or device_memory_limit == int(0)) - else parse_bytes(device_memory_limit), - "memory_limit": memory_limit, - "local_directory": local_directory, - }, - ), - **kwargs, - ) - for i in range(nprocs) - ] + services = {} - @gen.coroutine - def close_all(): - # Unregister all workers from scheduler - yield [n._close(timeout=2) for n in nannies] + if dashboard: + try: + from distributed.dashboard import BokehWorker + except ImportError: + pass + else: + if dashboard_prefix: + result = (BokehWorker, {"prefix": dashboard_prefix}) + else: + result = BokehWorker + services[("dashboard", dashboard_address)] = result + + if resources: + resources = resources.replace(",", " ").split() + resources = dict(pair.split("=") for pair in resources) + resources = valmap(float, resources) + else: + resources = None - def on_signal(signum): - logger.info("Exiting on signal %d", signum) - close_all() + loop = IOLoop.current() - @gen.coroutine - def run(): - yield nannies - yield [n.finished() for n in nannies] + preload_argv = kwargs.get("preload_argv", []) + kwargs = {"worker_port": None, "listen_address": None} + t = Nanny - install_signal_handlers(loop, cleanup=on_signal) + if not scheduler and not scheduler_file and "scheduler-address" not in config: + raise ValueError( + "Need to provide scheduler address like\n" + "dask-worker SCHEDULER_ADDRESS:8786" + ) + + if interface and host: + raise ValueError("Can not specify both interface and host") + + if rmm_pool_size is not None: + try: + import rmm # noqa F401 + except ImportError: + raise ValueError( + "RMM pool requested but module 'rmm' is not available. " + "For installation instructions, please see " + "https://github.com/rapidsai/rmm" + ) # pragma: no cover + rmm_pool_size = parse_bytes(rmm_pool_size) + + # Ensure this parent dask-cuda-worker process uses the same UCX + # configuration as child worker processes created by it. 
+ initialize( + create_cuda_context=False, + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_infiniband=enable_infiniband, + enable_nvlink=enable_nvlink, + enable_rdmacm=enable_rdmacm, + net_devices=net_devices, + cuda_device_index=0, + ) - try: - loop.run_sync(run) - except (KeyboardInterrupt, TimeoutError): - pass - finally: - logger.info("End worker") + self.nannies = [ + t( + scheduler, + scheduler_file=scheduler_file, + nthreads=nthreads, + services=services, + loop=loop, + resources=resources, + memory_limit=memory_limit, + interface=_get_interface(interface, host, i, net_devices), + host=host, + preload=(list(preload) or []) + ["dask_cuda.initialize"], + preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], + security=sec, + env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, + plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, + name=name if nprocs == 1 or not name else name + "-" + str(i), + local_directory=local_directory, + config={ + "ucx": get_ucx_config( + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_infiniband=enable_infiniband, + enable_nvlink=enable_nvlink, + enable_rdmacm=enable_rdmacm, + net_devices=net_devices, + cuda_device_index=i, + ) + }, + data=( + DeviceHostFile, + { + "device_memory_limit": get_device_total_memory(index=i) + if ( + device_memory_limit == "auto" + or device_memory_limit == int(0) + ) + else parse_bytes(device_memory_limit), + "memory_limit": memory_limit, + "local_directory": local_directory, + }, + ), + **kwargs, + ) + for i in range(nprocs) + ] + + def __await__(self): + return self._wait().__await__() + + async def _wait(self): + await asyncio.gather(*self.nannies) + + async def finished(self): + await asyncio.gather(*[n.finished() for n in self.nannies]) + + async def close(self, timeout=2): + await asyncio.gather(*[n.close(timeout=timeout) for n in self.nannies]) def go(): From 178aba8484b55f38dbacae7b411af7db6fa783b8 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Wed, 24 Jun 2020 06:45:52 -0700 Subject: [PATCH 050/126] Remove unused import --- dask_cuda/dask_cuda_worker.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 2a3f537d..0aa04744 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -20,7 +20,6 @@ import click from toolz import valmap -from tornado import gen from tornado.ioloop import IOLoop, TimeoutError from .device_host_file import DeviceHostFile From a8c0ce7b45512cae112aa10992dd6cdd17cd5f0e Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Wed, 24 Jun 2020 07:23:47 -0700 Subject: [PATCH 051/126] Move security options to CLI function and set defaults on class --- dask_cuda/dask_cuda_worker.py | 70 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/dask_cuda_worker.py index 0aa04744..479b372c 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/dask_cuda_worker.py @@ -244,6 +244,15 @@ def main( net_devices, **kwargs, ): + if tls_ca_file and tls_cert and tls_key: + security = Security( + tls_ca_file=tls_ca_file, + tls_worker_cert=tls_cert, + tls_worker_key=tls_key, + ) + else: + security = None + worker = CUDAWorker( scheduler, host, @@ -262,9 +271,7 @@ def main( death_timeout, preload, dashboard_prefix, - tls_ca_file, - tls_cert, - tls_key, + security, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, @@ -297,44 +304,33 @@ class CUDAWorker: def __init__( self, scheduler, - host, - 
nthreads, - name, - memory_limit, - device_memory_limit, - rmm_pool_size, - pid_file, - resources, - dashboard, - dashboard_address, - local_directory, - scheduler_file, - interface, - death_timeout, - preload, - dashboard_prefix, - tls_ca_file, - tls_cert, - tls_key, - enable_tcp_over_ucx, - enable_infiniband, - enable_nvlink, - enable_rdmacm, - net_devices, + host=None, + nthreads=0, + name=None, + memory_limit="auto", + device_memory_limit="auto", + rmm_pool_size=None, + pid_file=None, + resources=None, + dashboard=True, + dashboard_address=":0", + local_directory=None, + scheduler_file=None, + interface=None, + death_timeout=None, + preload=[], + dashboard_prefix=None, + security=None, + enable_tcp_over_ucx=False, + enable_infiniband=False, + enable_nvlink=False, + enable_rdmacm=False, + net_devices=None, **kwargs, ): enable_proctitle_on_current() enable_proctitle_on_children() - if tls_ca_file and tls_cert and tls_key: - sec = Security( - tls_ca_file=tls_ca_file, - tls_worker_cert=tls_cert, - tls_worker_key=tls_key, - ) - else: - sec = None - try: nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) except KeyError: @@ -427,7 +423,7 @@ def del_pid_file(): host=host, preload=(list(preload) or []) + ["dask_cuda.initialize"], preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], - security=sec, + security=security, env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, name=name if nprocs == 1 or not name else name + "-" + str(i), From a602bb7add964e0ed6b84e392b9c96e972e8a21a Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Wed, 24 Jun 2020 07:59:17 -0700 Subject: [PATCH 052/126] Move CLI code to cli directory, move CUDAWorker to cudaworker.py, expose CUDAWorker at top level --- dask_cuda/__init__.py | 1 + dask_cuda/{ => cli}/dask_cuda_worker.py | 212 +----------------------- dask_cuda/cuda_worker.py | 212 ++++++++++++++++++++++++ setup.py | 2 +- 4 files changed, 217 insertions(+), 210 deletions(-) rename dask_cuda/{ => cli}/dask_cuda_worker.py (51%) create mode 100644 dask_cuda/cuda_worker.py diff --git a/dask_cuda/__init__.py b/dask_cuda/__init__.py index bbc1ac59..91b049f7 100644 --- a/dask_cuda/__init__.py +++ b/dask_cuda/__init__.py @@ -1,5 +1,6 @@ from ._version import get_versions from .local_cuda_cluster import LocalCUDACluster +from .cuda_worker import CUDAWorker __version__ = get_versions()["version"] del get_versions diff --git a/dask_cuda/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py similarity index 51% rename from dask_cuda/dask_cuda_worker.py rename to dask_cuda/cli/dask_cuda_worker.py index 479b372c..ea07b63c 100755 --- a/dask_cuda/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -1,39 +1,16 @@ from __future__ import absolute_import, division, print_function -import asyncio -import atexit import logging -import multiprocessing -import os -from distributed import Nanny from distributed.cli.utils import check_python_3, install_signal_handlers -from distributed.config import config from distributed.preloading import validate_preload_argv -from distributed.proctitle import ( - enable_proctitle_on_children, - enable_proctitle_on_current, -) + from distributed.security import Security -from distributed.utils import parse_bytes -from distributed.worker import parse_memory_limit import click -from toolz import valmap from tornado.ioloop import IOLoop, TimeoutError -from .device_host_file import DeviceHostFile -from .initialize import initialize -from 
.local_cuda_cluster import cuda_visible_devices -from .utils import ( - CPUAffinity, - RMMPool, - get_cpu_affinity, - get_device_total_memory, - get_n_gpus, - get_ucx_config, - get_ucx_net_devices, -) +from ..cuda_worker import CUDAWorker logger = logging.getLogger(__name__) @@ -41,18 +18,6 @@ pem_file_option_type = click.Path(exists=True, resolve_path=True) -def _get_interface(interface, host, cuda_device_index, ucx_net_devices): - if host: - return None - else: - return interface or get_ucx_net_devices( - cuda_device_index=cuda_device_index, - ucx_net_devices=ucx_net_devices, - get_openfabrics=False, - get_network=True, - ) - - @click.command(context_settings=dict(ignore_unknown_options=True)) @click.argument("scheduler", type=str, required=False) @click.option( @@ -246,9 +211,7 @@ def main( ): if tls_ca_file and tls_cert and tls_key: security = Security( - tls_ca_file=tls_ca_file, - tls_worker_cert=tls_cert, - tls_worker_key=tls_key, + tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key, ) else: security = None @@ -300,175 +263,6 @@ async def run(): logger.info("End worker") -class CUDAWorker: - def __init__( - self, - scheduler, - host=None, - nthreads=0, - name=None, - memory_limit="auto", - device_memory_limit="auto", - rmm_pool_size=None, - pid_file=None, - resources=None, - dashboard=True, - dashboard_address=":0", - local_directory=None, - scheduler_file=None, - interface=None, - death_timeout=None, - preload=[], - dashboard_prefix=None, - security=None, - enable_tcp_over_ucx=False, - enable_infiniband=False, - enable_nvlink=False, - enable_rdmacm=False, - net_devices=None, - **kwargs, - ): - enable_proctitle_on_current() - enable_proctitle_on_children() - - try: - nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) - except KeyError: - nprocs = get_n_gpus() - - if not nthreads: - nthreads = min(1, multiprocessing.cpu_count() // nprocs) - - memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs) - - if pid_file: - with open(pid_file, "w") as f: - f.write(str(os.getpid())) - - def del_pid_file(): - if os.path.exists(pid_file): - os.remove(pid_file) - - atexit.register(del_pid_file) - - services = {} - - if dashboard: - try: - from distributed.dashboard import BokehWorker - except ImportError: - pass - else: - if dashboard_prefix: - result = (BokehWorker, {"prefix": dashboard_prefix}) - else: - result = BokehWorker - services[("dashboard", dashboard_address)] = result - - if resources: - resources = resources.replace(",", " ").split() - resources = dict(pair.split("=") for pair in resources) - resources = valmap(float, resources) - else: - resources = None - - loop = IOLoop.current() - - preload_argv = kwargs.get("preload_argv", []) - kwargs = {"worker_port": None, "listen_address": None} - t = Nanny - - if not scheduler and not scheduler_file and "scheduler-address" not in config: - raise ValueError( - "Need to provide scheduler address like\n" - "dask-worker SCHEDULER_ADDRESS:8786" - ) - - if interface and host: - raise ValueError("Can not specify both interface and host") - - if rmm_pool_size is not None: - try: - import rmm # noqa F401 - except ImportError: - raise ValueError( - "RMM pool requested but module 'rmm' is not available. " - "For installation instructions, please see " - "https://github.com/rapidsai/rmm" - ) # pragma: no cover - rmm_pool_size = parse_bytes(rmm_pool_size) - - # Ensure this parent dask-cuda-worker process uses the same UCX - # configuration as child worker processes created by it. 
- initialize( - create_cuda_context=False, - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - enable_rdmacm=enable_rdmacm, - net_devices=net_devices, - cuda_device_index=0, - ) - - self.nannies = [ - t( - scheduler, - scheduler_file=scheduler_file, - nthreads=nthreads, - services=services, - loop=loop, - resources=resources, - memory_limit=memory_limit, - interface=_get_interface(interface, host, i, net_devices), - host=host, - preload=(list(preload) or []) + ["dask_cuda.initialize"], - preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], - security=security, - env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, - plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, - name=name if nprocs == 1 or not name else name + "-" + str(i), - local_directory=local_directory, - config={ - "ucx": get_ucx_config( - enable_tcp_over_ucx=enable_tcp_over_ucx, - enable_infiniband=enable_infiniband, - enable_nvlink=enable_nvlink, - enable_rdmacm=enable_rdmacm, - net_devices=net_devices, - cuda_device_index=i, - ) - }, - data=( - DeviceHostFile, - { - "device_memory_limit": get_device_total_memory(index=i) - if ( - device_memory_limit == "auto" - or device_memory_limit == int(0) - ) - else parse_bytes(device_memory_limit), - "memory_limit": memory_limit, - "local_directory": local_directory, - }, - ), - **kwargs, - ) - for i in range(nprocs) - ] - - def __await__(self): - return self._wait().__await__() - - async def _wait(self): - await asyncio.gather(*self.nannies) - - async def finished(self): - await asyncio.gather(*[n.finished() for n in self.nannies]) - - async def close(self, timeout=2): - await asyncio.gather(*[n.close(timeout=timeout) for n in self.nannies]) - - def go(): check_python_3() main() diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py new file mode 100644 index 00000000..753aedd5 --- /dev/null +++ b/dask_cuda/cuda_worker.py @@ -0,0 +1,212 @@ +from __future__ import absolute_import, division, print_function + +import asyncio +import atexit +import multiprocessing +import os + +from distributed import Nanny +from distributed.config import config +from distributed.proctitle import ( + enable_proctitle_on_children, + enable_proctitle_on_current, +) +from distributed.utils import parse_bytes +from distributed.worker import parse_memory_limit + +from toolz import valmap +from tornado.ioloop import IOLoop + +from ..device_host_file import DeviceHostFile +from ..initialize import initialize +from ..local_cuda_cluster import cuda_visible_devices +from ..utils import ( + CPUAffinity, + RMMPool, + get_cpu_affinity, + get_device_total_memory, + get_n_gpus, + get_ucx_config, + get_ucx_net_devices, +) + + +def _get_interface(interface, host, cuda_device_index, ucx_net_devices): + if host: + return None + else: + return interface or get_ucx_net_devices( + cuda_device_index=cuda_device_index, + ucx_net_devices=ucx_net_devices, + get_openfabrics=False, + get_network=True, + ) + + +class CUDAWorker: + def __init__( + self, + scheduler, + host=None, + nthreads=0, + name=None, + memory_limit="auto", + device_memory_limit="auto", + rmm_pool_size=None, + pid_file=None, + resources=None, + dashboard=True, + dashboard_address=":0", + local_directory=None, + scheduler_file=None, + interface=None, + death_timeout=None, + preload=[], + dashboard_prefix=None, + security=None, + enable_tcp_over_ucx=False, + enable_infiniband=False, + enable_nvlink=False, + enable_rdmacm=False, + net_devices=None, + **kwargs, 
+ ): + enable_proctitle_on_current() + enable_proctitle_on_children() + + try: + nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(",")) + except KeyError: + nprocs = get_n_gpus() + + if not nthreads: + nthreads = min(1, multiprocessing.cpu_count() // nprocs) + + memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs) + + if pid_file: + with open(pid_file, "w") as f: + f.write(str(os.getpid())) + + def del_pid_file(): + if os.path.exists(pid_file): + os.remove(pid_file) + + atexit.register(del_pid_file) + + services = {} + + if dashboard: + try: + from distributed.dashboard import BokehWorker + except ImportError: + pass + else: + if dashboard_prefix: + result = (BokehWorker, {"prefix": dashboard_prefix}) + else: + result = BokehWorker + services[("dashboard", dashboard_address)] = result + + if resources: + resources = resources.replace(",", " ").split() + resources = dict(pair.split("=") for pair in resources) + resources = valmap(float, resources) + else: + resources = None + + loop = IOLoop.current() + + preload_argv = kwargs.get("preload_argv", []) + kwargs = {"worker_port": None, "listen_address": None} + t = Nanny + + if not scheduler and not scheduler_file and "scheduler-address" not in config: + raise ValueError( + "Need to provide scheduler address like\n" + "dask-worker SCHEDULER_ADDRESS:8786" + ) + + if interface and host: + raise ValueError("Can not specify both interface and host") + + if rmm_pool_size is not None: + try: + import rmm # noqa F401 + except ImportError: + raise ValueError( + "RMM pool requested but module 'rmm' is not available. " + "For installation instructions, please see " + "https://github.com/rapidsai/rmm" + ) # pragma: no cover + rmm_pool_size = parse_bytes(rmm_pool_size) + + # Ensure this parent dask-cuda-worker process uses the same UCX + # configuration as child worker processes created by it. 
+ initialize( + create_cuda_context=False, + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_infiniband=enable_infiniband, + enable_nvlink=enable_nvlink, + enable_rdmacm=enable_rdmacm, + net_devices=net_devices, + cuda_device_index=0, + ) + + self.nannies = [ + t( + scheduler, + scheduler_file=scheduler_file, + nthreads=nthreads, + services=services, + loop=loop, + resources=resources, + memory_limit=memory_limit, + interface=_get_interface(interface, host, i, net_devices), + host=host, + preload=(list(preload) or []) + ["dask_cuda.initialize"], + preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], + security=security, + env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, + plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, + name=name if nprocs == 1 or not name else name + "-" + str(i), + local_directory=local_directory, + config={ + "ucx": get_ucx_config( + enable_tcp_over_ucx=enable_tcp_over_ucx, + enable_infiniband=enable_infiniband, + enable_nvlink=enable_nvlink, + enable_rdmacm=enable_rdmacm, + net_devices=net_devices, + cuda_device_index=i, + ) + }, + data=( + DeviceHostFile, + { + "device_memory_limit": get_device_total_memory(index=i) + if ( + device_memory_limit == "auto" + or device_memory_limit == int(0) + ) + else parse_bytes(device_memory_limit), + "memory_limit": memory_limit, + "local_directory": local_directory, + }, + ), + **kwargs, + ) + for i in range(nprocs) + ] + + def __await__(self): + return self._wait().__await__() + + async def _wait(self): + await asyncio.gather(*self.nannies) + + async def finished(self): + await asyncio.gather(*[n.finished() for n in self.nannies]) + + async def close(self, timeout=2): + await asyncio.gather(*[n.close(timeout=timeout) for n in self.nannies]) diff --git a/setup.py b/setup.py index b83131cc..64753fa9 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,6 @@ install_requires=open("requirements.txt").read().strip().split("\n"), entry_points=""" [console_scripts] - dask-cuda-worker=dask_cuda.dask_cuda_worker:go + dask-cuda-worker=dask_cuda.cli.dask_cuda_worker:go """, ) From bbfb0693108d52928e27497c17686687d04859b2 Mon Sep 17 00:00:00 2001 From: Jacob Tomlinson Date: Thu, 25 Jun 2020 03:18:48 -0700 Subject: [PATCH 053/126] Fix relative imports --- dask_cuda/cuda_worker.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 753aedd5..c187c1de 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -17,10 +17,10 @@ from toolz import valmap from tornado.ioloop import IOLoop -from ..device_host_file import DeviceHostFile -from ..initialize import initialize -from ..local_cuda_cluster import cuda_visible_devices -from ..utils import ( +from .device_host_file import DeviceHostFile +from .initialize import initialize +from .local_cuda_cluster import cuda_visible_devices +from .utils import ( CPUAffinity, RMMPool, get_cpu_affinity, From aefae848894714851b550ee946fa67202efcf02c Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 25 Jun 2020 11:10:13 -0700 Subject: [PATCH 054/126] optionally allow benchmark to use cpu and external dask cluster --- dask_cuda/benchmarks/local_cudf_merge.py | 79 +++++++++++++++--------- dask_cuda/benchmarks/utils.py | 7 +++ 2 files changed, 57 insertions(+), 29 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index abbe1191..c2f49c4e 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ 
b/dask_cuda/benchmarks/local_cudf_merge.py @@ -14,17 +14,23 @@ setup_memory_pool, ) -import cudf -import cupy import numpy # Benchmarking cuDF merge operation based on # -def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match): +def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, + gpu): # Setting a seed that triggers max amount of comm in the two-GPU case. - cupy.random.seed(17561648246761420848) + if gpu: + import cupy as xp + import cudf as xdf + else: + import numpy as xp + import pandas as xdf + + xp.random.seed(2**32 - 1) chunk_type = chunk_type or "build" frac_match = frac_match or 1.0 @@ -40,15 +46,15 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match): start = local_size * i_chunk stop = start + local_size - parts_array = cupy.arange(num_chunks, dtype="int64") - suffle_array = cupy.repeat(parts_array, math.ceil(local_size / num_chunks)) + parts_array = xp.arange(num_chunks, dtype="int64") + suffle_array = xp.repeat(parts_array, math.ceil(local_size / num_chunks)) - df = cudf.DataFrame( + df = xdf.DataFrame( { - "key": cupy.arange(start, stop=stop, dtype="int64"), - "shuffle": cupy.random.permutation(suffle_array)[:local_size], - "payload": cupy.random.permutation( - cupy.arange(local_size, dtype="int64") + "key": xp.arange(start, stop=stop, dtype="int64"), + "shuffle": xp.random.permutation(suffle_array)[:local_size], + "payload": xp.random.permutation( + xp.arange(local_size, dtype="int64") ), } ) @@ -69,25 +75,25 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match): for i in range(num_chunks): bgn = (local_size * i) + (sub_local_size * i_chunk) end = bgn + sub_local_size - ar = cupy.arange(bgn, stop=end, dtype="int64") - arrays.append(cupy.random.permutation(ar)[:sub_local_size_use]) - key_array_match = cupy.concatenate(tuple(arrays), axis=0) + ar = xp.arange(bgn, stop=end, dtype="int64") + arrays.append(xp.random.permutation(ar)[:sub_local_size_use]) + key_array_match = xp.concatenate(tuple(arrays), axis=0) # Step 2. Add values that DON'T match missing_size = local_size - key_array_match.shape[0] start = local_size * num_chunks + local_size * i_chunk stop = start + missing_size - key_array_no_match = cupy.arange(start, stop=stop, dtype="int64") + key_array_no_match = xp.arange(start, stop=stop, dtype="int64") # Step 3. 
Combine and create the final dataframe chunk (dask_cudf partition) - key_array_combine = cupy.concatenate( + key_array_combine = xp.concatenate( (key_array_match, key_array_no_match), axis=0 ) - df = cudf.DataFrame( + df = xdf.DataFrame( { - "key": cupy.random.permutation(key_array_combine), - "payload": cupy.random.permutation( - cupy.arange(local_size, dtype="int64") + "key": xp.random.permutation(key_array_combine), + "payload": xp.random.permutation( + xp.arange(local_size, dtype="int64") ), } ) @@ -97,13 +103,15 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match): def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args): parts = [chunk_size for i in range(num_chunks)] - meta = generate_chunk(0, 4, 1, chunk_type, None) + device_type = True if args.type is 'gpu' else False + meta = generate_chunk(0, 4, 1, chunk_type, None, device_type) divisions = [None] * (len(parts) + 1) name = "generate-data-" + tokenize(chunk_size, num_chunks, frac_match, chunk_type) graph = { - (name, i): (generate_chunk, i, part, len(parts), chunk_type, frac_match) + (name, i): (generate_chunk, i, part, len(parts), chunk_type, + frac_match, device_type) for i, part in enumerate(parts) } @@ -177,15 +185,18 @@ def main(args): cluster_kwargs = cluster_options["kwargs"] scheduler_addr = cluster_options["scheduler_addr"] - cluster = Cluster(*cluster_args, **cluster_kwargs) - if args.multi_node: - import time + if args.sched_addr: + client = Client(args.sched_addr) + else: + cluster = Cluster(*cluster_args, **cluster_kwargs) + if args.multi_node: + import time - # Allow some time for workers to start and connect to scheduler - # TODO: make this a command-line argument? - time.sleep(15) + # Allow some time for workers to start and connect to scheduler + # TODO: make this a command-line argument? 
+ time.sleep(15) - client = Client(scheduler_addr if args.multi_node else cluster) + client = Client(scheduler_addr if args.multi_node else cluster) client.run(setup_memory_pool, disable_pool=args.no_rmm_pool) # Create an RMM pool on the scheduler due to occasional deserialization @@ -227,6 +238,7 @@ def main(args): print("Merge benchmark") print("-------------------------------") print(f"backend | {args.backend}") + print(f"backend | {args.type}") print(f"rows-per-chunk | {args.chunk_size}") print(f"protocol | {args.protocol}") print(f"device(s) | {args.devs}") @@ -278,6 +290,15 @@ def parse_args(): "type": str, "help": "The backend to use.", }, + { + "name": ["-t", "--type",], + "choices": ["cpu", "gpu"], + "default": "gpu", + "type": str, + "help": "Do merge with GPU or CPU dataframes", + }, + + { "name": ["-c", "--chunk-size",], "default": 1_000_000, diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 80fea4df..753ec1be 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -82,6 +82,13 @@ def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[] dest="multi_node", help="Runs a multi-node cluster on the hosts specified by --hosts.", ) + parser.add_argument( + "--scheduler-address", + default=None, + type=str, + dest="sched_addr", + help="Scheduler Address -- assumes cluster is created outside of benchmark.", + ) parser.add_argument( "--hosts", default=None, From 8798ad1946d97a95cd6650f9ab3f1cb851af4ff4 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 29 Jun 2020 13:31:04 -0700 Subject: [PATCH 055/126] Add missing __init__.py to dask_cuda/cli --- dask_cuda/cli/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dask_cuda/cli/__init__.py diff --git a/dask_cuda/cli/__init__.py b/dask_cuda/cli/__init__.py new file mode 100644 index 00000000..e69de29b From f670e0faf9c811b2acacf21544e63d452549c851 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 29 Jun 2020 15:02:36 -0700 Subject: [PATCH 056/126] Create local_directory if it doesn't exist --- dask_cuda/device_host_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 5f094472..a62c5544 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -91,8 +91,10 @@ def __init__( ): if local_directory is None: local_directory = dask.config.get("temporary-directory") or os.getcwd() + + if not os.path.exists(local_directory): os.makedirs(local_directory, exist_ok=True) - local_directory = os.path.join(local_directory, "dask-worker-space") + local_directory = os.path.join(local_directory, "dask-worker-space") self.disk_func_path = os.path.join(local_directory, "storage") From 66b7ce428e6647cff5219861faa30bc448b33dc3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 29 Jun 2020 15:02:57 -0700 Subject: [PATCH 057/126] Fix dask-cuda-worker default for --local-directory --- dask_cuda/cli/dask_cuda_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index ea07b63c..eb1da172 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -107,7 +107,7 @@ ) @click.option("--pid-file", type=str, default="", help="File to write the process PID") @click.option( - "--local-directory", default="", type=str, help="Directory to place worker files" + "--local-directory", 
default=None, type=str, help="Directory to place worker files" ) @click.option( "--resources", From 17785c351ef2ef9bac51c3c7876e07da2c76d865 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 30 Jun 2020 14:56:46 -0700 Subject: [PATCH 058/126] add dask distributed gpu tests --- ci/gpu/build.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index c2c34db7..b0d14fe7 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -101,4 +101,18 @@ else cd $WORKSPACE ls dask_cuda/tests/ UCXPY_IFNAME=eth0 UCX_WARN_UNUSED_ENV_VARS=n UCX_MEMTYPE_CACHE=n py.test -vs --cache-clear --junitxml=${WORKSPACE}/junit-dask-cuda.xml --cov-config=.coveragerc --cov=dask_cuda --cov-report=xml:${WORKSPACE}/dask-cuda-coverage.xml --cov-report term dask_cuda/tests/ + + logger "Running dask.distributed GPU tests" + # Test downstream packages, which requires Python v3.7 + if [ $(python -c "import sys; print(sys.version_info[1])") -ge "7" ]; then + logger "TEST OF DASK/UCX..." + py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_cupy as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_numba as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_rmm as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_collection_cuda as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.tests.test_nanny as m;print(m.__file__)"` + py.test --cache-clear -vs `python -c "import distributed.tests.test_gpu_metrics as m;print(m.__file__)"` + py.test --cache-clear -m "slow" -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` + fi fi From c4bf454bdae795f97af5f2cb6858ba557231c82c Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 1 Jul 2020 06:47:59 -0700 Subject: [PATCH 059/126] fix rmm_pool arg name in docs --- docs/source/ucx.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index 9df9f9fd..732d5a8b 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -124,7 +124,7 @@ All options discussed previously are also available in ``LocalCUDACluster``. It enable_nvlink = True enable_infiniband = True ucx_net_devices="auto" - rmm_pool="24GB" + rmm_pool_size="24GB" ) client = Client(cluster) From 85b80b3267ff3a4e0f5b5a2ac9b5d14b0f839585 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 1 Jul 2020 06:56:37 -0700 Subject: [PATCH 060/126] update docstring --- dask_cuda/local_cuda_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 8bed6645..02fcd77a 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -100,7 +100,7 @@ class LocalCUDACluster(LocalCluster): configured or is disconnected, for that reason it's limited to InfiniBand only and will still cause unpredictable errors if not _ALL_ interfaces are connected and properly configured. - rmm_pool: None, int or str + rmm_pool_size: None, int or str When None (default), no RMM pool is initialized. If a different value is given, it can be an integer (bytes) or string (like 5GB or 5000M)." 
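The corrected `rmm_pool_size` keyword can be exercised directly on `LocalCUDACluster`. A minimal sketch, assuming a CUDA-capable host with RMM installed and relying on the `rmm.mr.get_default_resource_type` helper used by the RMM pool tests earlier in this series:

from dask.distributed import Client

import rmm

from dask_cuda import LocalCUDACluster

# Sketch: assumes a CUDA-capable host with RMM installed.
# One worker per visible GPU, each initializing a 1 GB RMM pool
# (rmm_pool_size accepts an integer in bytes or a string such as "1GB").
cluster = LocalCUDACluster(rmm_pool_size="1GB")
client = Client(cluster)

# Confirm every worker switched to the pool-backed memory resource.
print(client.run(rmm.mr.get_default_resource_type))
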
From b971ef8d8ea3c34e23d3bbe4cc14aa6e05fc6278 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 1 Jul 2020 06:59:34 -0700 Subject: [PATCH 061/126] remove ucx tests --- ci/gpu/build.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b0d14fe7..0c92c6e2 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -110,9 +110,7 @@ else py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_numba as m;print(m.__file__)"` py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_rmm as m;print(m.__file__)"` py.test --cache-clear -vs `python -c "import distributed.protocol.tests.test_collection_cuda as m;print(m.__file__)"` - py.test --cache-clear -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` py.test --cache-clear -vs `python -c "import distributed.tests.test_nanny as m;print(m.__file__)"` py.test --cache-clear -vs `python -c "import distributed.tests.test_gpu_metrics as m;print(m.__file__)"` - py.test --cache-clear -m "slow" -vs `python -c "import distributed.comm.tests.test_ucx as m;print(m.__file__)"` fi fi From 3ee7e8d39b53f53eb7183b82fbb1391f0d92377d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 1 Jul 2020 13:02:22 -0700 Subject: [PATCH 062/126] add warning when using nvlink without rmm defined --- dask_cuda/local_cuda_cluster.py | 9 +++++++++ dask_cuda/tests/test_local_cuda_cluster.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 8bed6645..adcc51b3 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -1,5 +1,6 @@ import copy import os +import warnings import dask from dask.distributed import LocalCluster @@ -167,6 +168,14 @@ def __init__( "https://github.com/rapidsai/rmm" ) # pragma: no cover self.rmm_pool_size = parse_bytes(self.rmm_pool_size) + else: + if enable_nvlink: + warnings.warn( + "When using NVLink we recommend setting a " + "`rmm_pool_size`. Please see: " + "https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes " + "for more details" + ) if not processes: raise ValueError( diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 2958dc29..bbe91d8e 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -115,3 +115,12 @@ async def test_rmm_pool(): memory_resource_type = await client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): assert v is rmm._lib.memory_resource.CNMemMemoryResource + + +@gen_test(timeout=20) +async def test_warn_no_rmm_defined(): + with pytest.warns(Warning) as info: + async with LocalCUDACluster(asynchronous=True, enable_nvlink=True) as cluster: + pass + + assert "When using NVLink" in str(info[0].message) From 76f766ebbba0e2eff54484d3da8e5d8952478316 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 1 Jul 2020 16:33:30 -0400 Subject: [PATCH 063/126] Update dask_cuda/local_cuda_cluster.py Co-authored-by: Peter Andreas Entschev --- dask_cuda/local_cuda_cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index adcc51b3..178fd187 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -172,7 +172,8 @@ def __init__( if enable_nvlink: warnings.warn( "When using NVLink we recommend setting a " - "`rmm_pool_size`. 
Please see: " + "`rmm_pool_size` or setting an RMM pool via `client.run`. " + "Please see: " "https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes " "for more details" ) From c12da4feda9feb25b92ec1b5e4bc5651b5a60df5 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 2 Jul 2020 10:16:51 -0700 Subject: [PATCH 064/126] warn and add test for dask-cuda-workers --- dask_cuda/cli/dask_cuda_worker.py | 1 - dask_cuda/cuda_worker.py | 11 +++++++++ dask_cuda/tests/test_dask_cuda_worker.py | 31 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index eb1da172..d70b0706 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -4,7 +4,6 @@ from distributed.cli.utils import check_python_3, install_signal_handlers from distributed.preloading import validate_preload_argv - from distributed.security import Security import click diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index c187c1de..f2e955af 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -4,6 +4,7 @@ import atexit import multiprocessing import os +import warnings from distributed import Nanny from distributed.config import config @@ -140,6 +141,16 @@ def del_pid_file(): "https://github.com/rapidsai/rmm" ) # pragma: no cover rmm_pool_size = parse_bytes(rmm_pool_size) + else: + if enable_nvlink: + warnings.warn( + "When using NVLink we recommend setting a " + "`rmm_pool_size`. Please see: " + "https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes " + "for more details" + ) + + # Ensure this parent dask-cuda-worker process uses the same UCX # configuration as child worker processes created by it. diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index e2132888..c1d86383 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -67,3 +67,34 @@ def test_rmm_pool(loop): # noqa: F811 memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): assert v is rmm._lib.memory_resource.CNMemMemoryResource + + +def test_nvlink_no_rmm_warning(loop): # noqa: F811 + os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" + try: + with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): + with popen( + [ + "dask-cuda-worker", + "127.0.0.1:9359", + "--host", + "127.0.0.1", + "--enable-nvlink", + ], + stdout=True, + stderr=True, + ) as proc: + with Client("127.0.0.1:9359", loop=loop) as client: + assert wait_workers(client, n_gpus=2) + + # grab first 5 lines of dask-cuda-worker startup + lines = [] + for idx, line in enumerate(proc.stderr): + lines.append(line) + if idx == 5: + break + + assert any(b"When using NVLink we" in line for line in lines) + + finally: + del os.environ["CUDA_VISIBLE_DEVICES"] From de61d5ae26d0bf70109f2c120ea3d7f2a77d8e9d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 2 Jul 2020 12:57:28 -0700 Subject: [PATCH 065/126] lint --- dask_cuda/cuda_worker.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index f2e955af..4c9230b8 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -150,8 +150,6 @@ def del_pid_file(): "for more details" ) - - # Ensure this parent dask-cuda-worker process uses the same UCX # configuration as child worker processes created by it. 
initialize( From 458ca6c440294ea7e2c84c12e0b06ecb818df60c Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 3 Jul 2020 06:30:11 -0700 Subject: [PATCH 066/126] lint again --- dask_cuda/cuda_worker.py | 4 ++-- dask_cuda/local_cuda_cluster.py | 4 ++-- dask_cuda/tests/test_local_cuda_cluster.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 4c9230b8..7dea00ba 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -146,8 +146,8 @@ def del_pid_file(): warnings.warn( "When using NVLink we recommend setting a " "`rmm_pool_size`. Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes " - "for more details" + "https://dask-cuda.readthedocs.io/en/latest/ucx.html" + "#important-notes for more details" ) # Ensure this parent dask-cuda-worker process uses the same UCX diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 178fd187..c5022fb2 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -174,8 +174,8 @@ def __init__( "When using NVLink we recommend setting a " "`rmm_pool_size` or setting an RMM pool via `client.run`. " "Please see: " - "https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes " - "for more details" + "https://dask-cuda.readthedocs.io/en/latest/ucx.html" + "#important-notes for more details" ) if not processes: diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index bbe91d8e..1274c7c3 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -120,7 +120,7 @@ async def test_rmm_pool(): @gen_test(timeout=20) async def test_warn_no_rmm_defined(): with pytest.warns(Warning) as info: - async with LocalCUDACluster(asynchronous=True, enable_nvlink=True) as cluster: + async with LocalCUDACluster(asynchronous=True, enable_nvlink=True): pass assert "When using NVLink" in str(info[0].message) From bffcab7dfb9b546337d81bafead8d85cc7092df7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 6 Jul 2020 07:59:35 -0700 Subject: [PATCH 067/126] fix test --- dask_cuda/tests/test_dask_cuda_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index c1d86383..9734b879 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -85,7 +85,8 @@ def test_nvlink_no_rmm_warning(loop): # noqa: F811 stderr=True, ) as proc: with Client("127.0.0.1:9359", loop=loop) as client: - assert wait_workers(client, n_gpus=2) + # CI only has one GPU + assert wait_workers(client, n_gpus=1) # grab first 5 lines of dask-cuda-worker startup lines = [] From acce917fcf183dff48c66210ac53fdbeec6f320f Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 6 Jul 2020 12:27:28 -0500 Subject: [PATCH 068/126] handle case when rmm/cupy/cudf is not on system (CPU ONLY) --- dask_cuda/benchmarks/utils.py | 7 +++++-- dask_cuda/explicit_comms/dataframe_merge.py | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 753ec1be..0af50e89 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -183,8 +183,11 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): - import rmm - import cupy + try: + import rmm + import 
cupy + except ImportError: + return None rmm.reinitialize( pool_allocator=not disable_pool, devices=0, initial_pool_size=pool_size, diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index 41e35d87..1ec7b593 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -3,7 +3,10 @@ from dask.dataframe.shuffle import partitioning_index, shuffle_group from distributed.protocol import to_serialize -import cudf +try: + import cudf +except ImportError: + pass import pandas from . import comms From 6f00039dcc684718714cc58456a01b65a52ee158 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 6 Jul 2020 08:32:42 -0700 Subject: [PATCH 069/126] fix test --- dask_cuda/tests/test_dask_cuda_worker.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 9734b879..8123f448 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -72,11 +72,11 @@ def test_rmm_pool(loop): # noqa: F811 def test_nvlink_no_rmm_warning(loop): # noqa: F811 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" try: - with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): + with popen(["dask-scheduler", "--port", "9379", "--no-dashboard"]): with popen( [ "dask-cuda-worker", - "127.0.0.1:9359", + "127.0.0.1:9379", "--host", "127.0.0.1", "--enable-nvlink", @@ -84,9 +84,8 @@ def test_nvlink_no_rmm_warning(loop): # noqa: F811 stdout=True, stderr=True, ) as proc: - with Client("127.0.0.1:9359", loop=loop) as client: - # CI only has one GPU - assert wait_workers(client, n_gpus=1) + with Client("127.0.0.1:9379", loop=loop) as client: + assert wait_workers(client, n_gpus=2) # grab first 5 lines of dask-cuda-worker startup lines = [] From b9fa2b14abe365a2224586c2f0bf597a0c071de9 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 Jul 2020 16:45:05 -0400 Subject: [PATCH 070/126] Update dask_cuda/benchmarks/local_cudf_merge.py Co-authored-by: Peter Andreas Entschev --- dask_cuda/benchmarks/local_cudf_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index c2f49c4e..95f69442 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -238,7 +238,7 @@ def main(args): print("Merge benchmark") print("-------------------------------") print(f"backend | {args.backend}") - print(f"backend | {args.type}") + print(f"merge type | {args.type}") print(f"rows-per-chunk | {args.chunk_size}") print(f"protocol | {args.protocol}") print(f"device(s) | {args.devs}") From d2cfab5e6802d9cf34c6502405143e930ea48465 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 7 Jul 2020 14:11:48 -0700 Subject: [PATCH 071/126] apply peters suggestion and combine tests --- dask_cuda/tests/test_dask_cuda_worker.py | 49 ++++++++---------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 8123f448..25359fe8 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -11,7 +11,7 @@ import pytest -def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 +def test_cuda_visible_devices_and_memory_limit_and_warning(loop): # noqa: F811 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,7,8" try: with 
popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): @@ -24,8 +24,11 @@ def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 "--device-memory-limit", "1 MB", "--no-dashboard", - ] - ): + "--enable-nvlink", + ], + stdout=True, + stderr=True, + ) as proc: with Client("127.0.0.1:9359", loop=loop) as client: assert wait_workers(client, n_gpus=4) @@ -42,6 +45,15 @@ def get_visible_devices(): for w in workers.values(): assert w["memory_limit"] == MEMORY_LIMIT // len(workers) + # grab first 5 lines of dask-cuda-worker startup + lines = [] + for idx, line in enumerate(proc.stderr): + lines.append(line) + if idx == 5: + break + + assert any(b"When using NVLink we" in line for line in lines) + assert len(expected) == 0 finally: del os.environ["CUDA_VISIBLE_DEVICES"] @@ -67,34 +79,3 @@ def test_rmm_pool(loop): # noqa: F811 memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): assert v is rmm._lib.memory_resource.CNMemMemoryResource - - -def test_nvlink_no_rmm_warning(loop): # noqa: F811 - os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" - try: - with popen(["dask-scheduler", "--port", "9379", "--no-dashboard"]): - with popen( - [ - "dask-cuda-worker", - "127.0.0.1:9379", - "--host", - "127.0.0.1", - "--enable-nvlink", - ], - stdout=True, - stderr=True, - ) as proc: - with Client("127.0.0.1:9379", loop=loop) as client: - assert wait_workers(client, n_gpus=2) - - # grab first 5 lines of dask-cuda-worker startup - lines = [] - for idx, line in enumerate(proc.stderr): - lines.append(line) - if idx == 5: - break - - assert any(b"When using NVLink we" in line for line in lines) - - finally: - del os.environ["CUDA_VISIBLE_DEVICES"] From 6db88a214add2555f5f0e7e1b6b38149633e0611 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 Jul 2020 13:07:54 +0200 Subject: [PATCH 072/126] isort: now configured like cuDF --- setup.cfg | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index e25321a1..76dd01bd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,8 +31,17 @@ order_by_type=True known_dask= dask distributed +known_rapids= + rmm + cuml + cugraph + dask_cudf + cudf + ucp +known_first_party= dask_cuda -sections=FUTURE,STDLIB,DASK,FIRSTPARTY,LOCALFOLDER +default_section=THIRDPARTY +sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER skip= .eggs .git From 42770023cee3305fcf88e52d946758d48fd71979 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 8 Jul 2020 13:08:14 +0200 Subject: [PATCH 073/126] reformat: flake8, black, and isort --- dask_cuda/benchmarks/local_cudf_merge.py | 10 ++++++---- dask_cuda/benchmarks/local_cupy_transpose_sum.py | 9 +++++---- dask_cuda/benchmarks/utils.py | 4 +++- dask_cuda/cli/dask_cuda_worker.py | 7 +++---- dask_cuda/cuda_worker.py | 6 +++--- dask_cuda/device_host_file.py | 6 +++--- dask_cuda/explicit_comms/dataframe_merge.py | 3 ++- dask_cuda/explicit_comms/utils.py | 6 +++--- dask_cuda/initialize.py | 4 ++-- dask_cuda/local_cuda_cluster.py | 2 +- dask_cuda/tests/test_dask_cuda_worker.py | 5 +++-- dask_cuda/tests/test_device_host_file.py | 11 ++++++----- dask_cuda/tests/test_dgx.py | 15 ++++++++------- dask_cuda/tests/test_explicit_comms.py | 12 +++++++----- dask_cuda/tests/test_initialize.py | 13 +++++++------ dask_cuda/tests/test_local_cuda_cluster.py | 7 ++++--- dask_cuda/tests/test_spill.py | 11 ++++++----- dask_cuda/tests/test_ucx_options.py | 8 ++++---- dask_cuda/tests/test_utils.py | 6 +++--- dask_cuda/tests/test_worker_spec.py | 5 +++-- setup.py | 3 ++- 21 files changed, 84 insertions(+), 69 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index abbe1191..a0e15e6c 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -2,10 +2,16 @@ from collections import defaultdict from time import perf_counter as clock +import cupy +import numpy + from dask.base import tokenize from dask.dataframe.core import new_dd_object from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, format_time, parse_bytes + +import cudf + from dask_cuda import explicit_comms from dask_cuda.benchmarks.utils import ( get_cluster_options, @@ -14,10 +20,6 @@ setup_memory_pool, ) -import cudf -import cupy -import numpy - # Benchmarking cuDF merge operation based on # diff --git a/dask_cuda/benchmarks/local_cupy_transpose_sum.py b/dask_cuda/benchmarks/local_cupy_transpose_sum.py index 86177dd2..fe9c9d3f 100644 --- a/dask_cuda/benchmarks/local_cupy_transpose_sum.py +++ b/dask_cuda/benchmarks/local_cupy_transpose_sum.py @@ -2,9 +2,13 @@ from collections import defaultdict from time import perf_counter as clock -import dask.array as da +import cupy +import numpy as np + +from dask import array as da from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, format_time, parse_bytes + from dask_cuda.benchmarks.utils import ( get_cluster_options, get_scheduler_workers, @@ -12,9 +16,6 @@ setup_memory_pool, ) -import cupy -import numpy as np - async def _run(client, args): # Create a simple random array diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 80fea4df..68fac72c 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -1,6 +1,7 @@ import argparse from dask.distributed import SSHCluster + from dask_cuda.local_cuda_cluster import LocalCUDACluster @@ -176,9 +177,10 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): - import rmm import cupy + import rmm + rmm.reinitialize( pool_allocator=not disable_pool, devices=0, initial_pool_size=pool_size, ) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index eb1da172..b343a224 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -2,14 +2,13 @@ import logging +import click +from tornado.ioloop 
import IOLoop, TimeoutError + from distributed.cli.utils import check_python_3, install_signal_handlers from distributed.preloading import validate_preload_argv - from distributed.security import Security -import click -from tornado.ioloop import IOLoop, TimeoutError - from ..cuda_worker import CUDAWorker logger = logging.getLogger(__name__) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index c187c1de..d57d9e81 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -5,6 +5,9 @@ import multiprocessing import os +from toolz import valmap +from tornado.ioloop import IOLoop + from distributed import Nanny from distributed.config import config from distributed.proctitle import ( @@ -14,9 +17,6 @@ from distributed.utils import parse_bytes from distributed.worker import parse_memory_limit -from toolz import valmap -from tornado.ioloop import IOLoop - from .device_host_file import DeviceHostFile from .initialize import initialize from .local_cuda_cluster import cuda_visible_devices diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 3f7b0a7f..90cdd869 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -1,6 +1,9 @@ import functools import os +from zict import Buffer, File, Func +from zict.common import ZictBase + import dask from distributed.protocol import ( dask_deserialize, @@ -13,9 +16,6 @@ from distributed.utils import nbytes from distributed.worker import weight -from zict import Buffer, File, Func -from zict.common import ZictBase - from .is_device_object import is_device_object from .utils import nvtx_annotate diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index 41e35d87..2c8d807e 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -1,10 +1,11 @@ import asyncio +import pandas + from dask.dataframe.shuffle import partitioning_index, shuffle_group from distributed.protocol import to_serialize import cudf -import pandas from . 
import comms diff --git a/dask_cuda/explicit_comms/utils.py b/dask_cuda/explicit_comms/utils.py index be1c443e..ec779f3a 100644 --- a/dask_cuda/explicit_comms/utils.py +++ b/dask_cuda/explicit_comms/utils.py @@ -1,11 +1,11 @@ from collections import OrderedDict -import dask.dataframe as dd -from distributed import default_client, wait - from toolz import first from tornado import gen +from dask import dataframe as dd +from distributed import default_client, wait + def workers_to_parts(futures): """ diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index a87c0798..e53c10de 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -25,11 +25,11 @@ """ import logging -import dask - import click import numba.cuda +import dask + from .utils import get_ucx_config logger = logging.getLogger(__name__) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 02fcd77a..d5a6e83d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -198,7 +198,7 @@ def __init__( if ucx_net_devices == "auto": try: - from ucp._libs.topological_distance import TopologicalDistance # noqa + from ucp._libs.topological_distance import TopologicalDistance # NOQA except ImportError: raise ValueError( "ucx_net_devices set to 'auto' but UCX-Py is not " diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index e2132888..fe88a7c2 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -2,13 +2,14 @@ import os -from dask_cuda.utils import get_gpu_count, wait_workers +import pytest + from distributed import Client from distributed.system import MEMORY_LIMIT from distributed.utils_test import loop # noqa: F401 from distributed.utils_test import popen -import pytest +from dask_cuda.utils import get_gpu_count, wait_workers def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 00e96d0a..0d12f97e 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -1,17 +1,18 @@ import os from random import randint +import numpy as np +import pytest + import dask -import dask.array as da +from dask import array as da +from distributed.protocol import deserialize_bytes, serialize_bytelist + from dask_cuda.device_host_file import ( DeviceHostFile, device_to_host, host_to_device, ) -from distributed.protocol import deserialize_bytes, serialize_bytelist - -import numpy as np -import pytest cupy = pytest.importorskip("cupy") diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index d867a9ae..b2378ffc 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -4,17 +4,18 @@ from enum import Enum, auto from time import sleep -import dask.array as da -from dask_cuda import LocalCUDACluster -from dask_cuda.initialize import initialize -from dask_cuda.utils import wait_workers -from distributed import Client -from distributed.utils import get_ip_interface - import numpy import pytest from tornado.ioloop import IOLoop +from dask import array as da +from distributed import Client +from distributed.utils import get_ip_interface + +from dask_cuda import LocalCUDACluster +from dask_cuda.initialize import initialize +from dask_cuda.utils import wait_workers + mp = mp.get_context("spawn") ucp = pytest.importorskip("ucp") psutil = pytest.importorskip("psutil") diff --git 
a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 2bacc8c5..fb4629fe 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -1,14 +1,16 @@ import multiprocessing as mp -import dask.dataframe as dd -from dask_cuda.explicit_comms import CommsContext, dataframe_merge +import numpy as np +import pandas as pd +import pytest + +from dask import dataframe as dd from distributed import Client from distributed.deploy.local import LocalCluster import cudf -import numpy as np -import pandas as pd -import pytest + +from dask_cuda.explicit_comms import CommsContext, dataframe_merge mp = mp.get_context("spawn") ucp = pytest.importorskip("ucp") diff --git a/dask_cuda/tests/test_initialize.py b/dask_cuda/tests/test_initialize.py index 7b450f41..2c56ab34 100644 --- a/dask_cuda/tests/test_initialize.py +++ b/dask_cuda/tests/test_initialize.py @@ -1,15 +1,16 @@ import multiprocessing as mp -import dask.array as da -from dask_cuda.initialize import initialize -from dask_cuda.utils import get_ucx_config -from distributed import Client -from distributed.deploy.local import LocalCluster - import numpy import psutil import pytest +from dask import array as da +from distributed import Client +from distributed.deploy.local import LocalCluster + +from dask_cuda.initialize import initialize +from dask_cuda.utils import get_ucx_config + mp = mp.get_context("spawn") ucp = pytest.importorskip("ucp") diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 2958dc29..2702cfc1 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -1,12 +1,13 @@ import os +import pytest + from dask.distributed import Client -from dask_cuda import LocalCUDACluster, utils -from dask_cuda.initialize import initialize from distributed.system import MEMORY_LIMIT from distributed.utils_test import gen_test -import pytest +from dask_cuda import LocalCUDACluster, utils +from dask_cuda.initialize import initialize @gen_test(timeout=20) diff --git a/dask_cuda/tests/test_spill.py b/dask_cuda/tests/test_spill.py index 8f45c729..c826d542 100644 --- a/dask_cuda/tests/test_spill.py +++ b/dask_cuda/tests/test_spill.py @@ -1,18 +1,19 @@ import os from time import sleep +import pytest +from zict.file import _safe_key as safe_key + import dask -import dask.array as da -from dask_cuda import LocalCUDACluster, utils -from dask_cuda.device_host_file import DeviceHostFile +from dask import array as da from distributed import Client, get_worker, wait from distributed.metrics import time from distributed.sizeof import sizeof from distributed.utils_test import gen_cluster, gen_test, loop # noqa: F401 from distributed.worker import Worker -import pytest -from zict.file import _safe_key as safe_key +from dask_cuda import LocalCUDACluster, utils +from dask_cuda.device_host_file import DeviceHostFile if utils.get_device_total_memory() < 1e10: pytest.skip("Not enough GPU memory", allow_module_level=True) diff --git a/dask_cuda/tests/test_ucx_options.py b/dask_cuda/tests/test_ucx_options.py index 8326f459..37f61d15 100644 --- a/dask_cuda/tests/test_ucx_options.py +++ b/dask_cuda/tests/test_ucx_options.py @@ -1,13 +1,13 @@ import multiprocessing as mp +import numpy +import pytest + import dask -import dask.array as da +from dask import array as da from distributed import Client from distributed.deploy.local import LocalCluster -import numpy -import pytest - mp = mp.get_context("spawn") ucp 
= pytest.importorskip("ucp") diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 60d52d51..7a02bee0 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -1,5 +1,8 @@ import os +import pytest +from numba import cuda + from dask_cuda.utils import ( get_cpu_affinity, get_device_total_memory, @@ -10,9 +13,6 @@ unpack_bitmask, ) -import pytest -from numba import cuda - def test_get_n_gpus(): assert isinstance(get_n_gpus(), int) diff --git a/dask_cuda/tests/test_worker_spec.py b/dask_cuda/tests/test_worker_spec.py index 454bf106..a157dcf9 100644 --- a/dask_cuda/tests/test_worker_spec.py +++ b/dask_cuda/tests/test_worker_spec.py @@ -1,7 +1,8 @@ -from dask_cuda.worker_spec import worker_spec +import pytest + from distributed import Nanny -import pytest +from dask_cuda.worker_spec import worker_spec def _check_option(spec, k, v): diff --git a/setup.py b/setup.py index 64753fa9..e585e40d 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,10 @@ import os from codecs import open -import versioneer from setuptools import find_packages, setup +import versioneer + # Get the long description from the README file with open(os.path.join(os.path.dirname(__file__), "README.md")) as f: long_description = f.read() From 7679c287bfac038c64fe1f3ecfcaa32150987cc3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 Jul 2020 12:52:11 +0200 Subject: [PATCH 074/126] explicit-comms: added test of merge where some workers as no partitions --- dask_cuda/tests/test_explicit_comms.py | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index fb4629fe..98301f00 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -90,3 +90,32 @@ def test_dataframe_merge(backend, protocol, nworkers): p.start() p.join() assert not p.exitcode + + +def _test_dataframe_merge_empty_partitions(nrows, npartitions): + with LocalCluster( + protocol="tcp", + dashboard_address=None, + n_workers=npartitions, + threads_per_worker=1, + processes=True, + ) as cluster: + with Client(cluster): + df1 = pd.DataFrame({"key": np.arange(nrows), "payload1": np.arange(nrows)}) + key = np.arange(nrows) + np.random.shuffle(key) + df2 = pd.DataFrame({"key": key, "payload2": np.arange(nrows)}) + expected = df1.merge(df2).set_index("key") + ddf1 = dd.from_pandas(df1, npartitions=npartitions) + ddf2 = dd.from_pandas(df2, npartitions=npartitions) + ddf3 = dataframe_merge(ddf1, ddf2, on="key").set_index("key") + got = ddf3.compute() + pd.testing.assert_frame_equal(got, expected) + + +def test_dataframe_merge_empty_partitions(): + # Notice, we use more partitions than rows + p = mp.Process(target=_test_dataframe_merge_empty_partitions, args=(2, 4)) + p.start() + p.join() + assert not p.exitcode From 5b83b9b1f6152e5e1696fce5971dd7f799e88065 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 8 Jul 2020 12:57:08 +0200 Subject: [PATCH 075/126] explicit-comms: now support when one df is not on all workers --- dask_cuda/explicit_comms/comms.py | 24 ++++---- dask_cuda/explicit_comms/dataframe_merge.py | 39 ++++++++---- dask_cuda/explicit_comms/utils.py | 66 ++++++--------------- dask_cuda/tests/test_explicit_comms.py | 2 +- 4 files changed, 57 insertions(+), 74 deletions(-) diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index 44b2bde0..36897ceb 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -125,7 +125,7 @@ def __init__(self, client=None): ) ) - # Each worker creates a UCX endpoint to all workers with greater rank + # Each worker creates an endpoint to all workers with greater rank self.run(_create_endpoints, self.worker_direct_addresses) # At this point all workers should have a rank and endpoints to @@ -211,14 +211,11 @@ def dataframe_operation(self, coroutine, df_list, extra_args=tuple()): """ df_parts_list = [] for df in df_list: - df_parts_list.append( - utils.workers_to_parts( - self.client.sync(utils.extract_ddf_partitions, df) - ) - ) + df_parts_list.append(utils.extract_ddf_partitions(df)) # Let's create a dict for each dataframe that specifices the # number of partitions each worker has + world = set() dfs_nparts = [] for df_parts in df_parts_list: nparts = {} @@ -226,14 +223,19 @@ def dataframe_operation(self, coroutine, df_list, extra_args=tuple()): npart = len(df_parts.get(worker, [])) if npart > 0: nparts[rank] = npart + world.add(rank) dfs_nparts.append(nparts) # Submit `coroutine` on each worker given the df_parts that # belong the specific worker as input ret = [] - for worker in self.worker_addresses: - dfs = [] - for df_parts in df_parts_list: - dfs.append(df_parts.get(worker, [])) - ret.append(self.submit(worker, coroutine, dfs_nparts, dfs, *extra_args)) + for rank, worker in enumerate(self.worker_addresses): + if rank in world: + dfs = [] + for df_parts in df_parts_list: + dfs.append(df_parts.get(worker, [])) + # print("dfs: ", dfs) + ret.append( + self.submit(worker, coroutine, world, dfs_nparts, dfs, *extra_args) + ) return utils.dataframes_to_dask_dataframe(ret) diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index 2c8d807e..f31f23d6 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -139,7 +139,7 @@ async def single_partition_join( return left_table.merge(right_table, left_on=left_on, right_on=right_on) -async def _dataframe_merge(s, dfs_nparts, dfs_parts, left_on, right_on): +async def _dataframe_merge(s, world, dfs_nparts, dfs_parts, left_on, right_on): """ Worker job that merge local DataFrames Parameters @@ -175,40 +175,53 @@ def df_concat(df_parts): else: return concat(df_parts) + assert s["rank"] in world + + # Trimming + trim_map = {} + for i in range(s["nworkers"]): + if i in world: + trim_map[i] = len(trim_map) + + nworkers = len(world) + rank = trim_map[s["rank"]] + eps = {trim_map[i]: s["eps"][trim_map[i]] for i in world if i != s["rank"]} + df1 = df_concat(dfs_parts[0]) df2 = df_concat(dfs_parts[1]) if len(dfs_nparts[0]) == 1 and len(dfs_nparts[1]) == 1: return df1.merge(df2, left_on=left_on, right_on=right_on) - elif len(dfs_nparts[0]) == 1: return await single_partition_join( - s["nworkers"], - s["rank"], - s["eps"], + nworkers, + rank, + eps, df1, df2, left_on, right_on, "left", - next(iter(dfs_nparts[0])), # Extracting the only 
key in `dfs_nparts[0]` + trim_map[ + next(iter(dfs_nparts[0])) + ], # Extracting the only key in `dfs_nparts[0]` ) elif len(dfs_nparts[1]) == 1: return await single_partition_join( - s["nworkers"], - s["rank"], - s["eps"], + nworkers, + rank, + eps, df1, df2, left_on, right_on, "right", - next(iter(dfs_nparts[1])), # Extracting the only key in `dfs_nparts[1]` + trim_map[ + next(iter(dfs_nparts[1])) + ], # Extracting the only key in `dfs_nparts[1]` ) else: - return await hash_join( - s["nworkers"], s["rank"], s["eps"], df1, df2, left_on, right_on - ) + return await hash_join(nworkers, rank, eps, df1, df2, left_on, right_on) def dataframe_merge(left, right, on=None, left_on=None, right_on=None, how="inner"): diff --git a/dask_cuda/explicit_comms/utils.py b/dask_cuda/explicit_comms/utils.py index ec779f3a..2e808ca6 100644 --- a/dask_cuda/explicit_comms/utils.py +++ b/dask_cuda/explicit_comms/utils.py @@ -1,58 +1,26 @@ -from collections import OrderedDict +import dask.dataframe as dd +from distributed import default_client, wait, get_client from toolz import first -from tornado import gen -from dask import dataframe as dd -from distributed import default_client, wait - - -def workers_to_parts(futures): - """ - Builds an ordered dict mapping each worker to their list - of parts - :param futures: list of (worker, part) tuples - :return: - """ - w_to_p_map = OrderedDict() - for w, p in futures: - if w not in w_to_p_map: - w_to_p_map[w] = [] - w_to_p_map[w].append(p) - return w_to_p_map - - -@gen.coroutine -def extract_ddf_partitions(ddf, client=None, agg=True): - """ - Given a Dask dataframe, return an OrderedDict mapping - 'worker -> [list of futures]' for each partition in ddf. - - :param ddf: Dask.dataframe split dataframe partitions into a list of - futures. 
- :param client: dask.distributed.Client Optional client to use - """ - client = default_client() if client is None else client +def extract_ddf_partitions(ddf): + """ Returns the mapping: worker -> [list of futures]""" + client = get_client() delayed_ddf = ddf.to_delayed() parts = client.compute(delayed_ddf) - yield wait(parts) - - key_to_part_dict = dict([(str(part.key), part) for part in parts]) - who_has = yield client.who_has(parts) - - worker_map = {} # Map from part -> worker - for key, workers in who_has.items(): - worker = first(workers) - worker_map[key_to_part_dict[key]] = worker - - worker_to_parts = [] - for part in parts: - worker = worker_map[part] - worker_to_parts.append((worker, part)) - - yield wait(worker_to_parts) - raise gen.Return(worker_to_parts) + wait(parts) + + key_to_part = dict([(str(part.key), part) for part in parts]) + ret = {} # Map worker -> [list of futures] + for key, workers in client.who_has(parts).items(): + worker = first( + workers + ) # If multiple workers have the part, we pick the first worker + if worker not in ret: + ret[worker] = [] + ret[worker].append(key_to_part[key]) + return ret def get_meta(df): diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 98301f00..5ca042f9 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -114,7 +114,7 @@ def _test_dataframe_merge_empty_partitions(nrows, npartitions): def test_dataframe_merge_empty_partitions(): - # Notice, we use more partitions than rows + # Notice, we use more partitions than rows p = mp.Process(target=_test_dataframe_merge_empty_partitions, args=(2, 4)) p.start() p.join() From 08ebaf2b4295255123739ad5dfac78d17eca6917 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 Jul 2020 13:11:36 +0200 Subject: [PATCH 076/126] reformat: isort --- dask_cuda/explicit_comms/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dask_cuda/explicit_comms/utils.py b/dask_cuda/explicit_comms/utils.py index 2e808ca6..b99ea4de 100644 --- a/dask_cuda/explicit_comms/utils.py +++ b/dask_cuda/explicit_comms/utils.py @@ -1,8 +1,8 @@ -import dask.dataframe as dd -from distributed import default_client, wait, get_client - from toolz import first +from dask import dataframe as dd +from distributed import default_client, get_client, wait + def extract_ddf_partitions(ddf): """ Returns the mapping: worker -> [list of futures]""" From c67555138a08871b3fbe201b41b53779b9ecc05e Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 8 Jul 2020 15:51:24 +0200 Subject: [PATCH 077/126] renamed world to workers --- dask_cuda/explicit_comms/dataframe_merge.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index f31f23d6..f7cf6f70 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -139,13 +139,15 @@ async def single_partition_join( return left_table.merge(right_table, left_on=left_on, right_on=right_on) -async def _dataframe_merge(s, world, dfs_nparts, dfs_parts, left_on, right_on): +async def _dataframe_merge(s, workers, dfs_nparts, dfs_parts, left_on, right_on): """ Worker job that merge local DataFrames Parameters ---------- s: dict Worker session state + workers: set + Set of ranks of all the participants dfs_nparts: list of dict List of dict that for each worker rank specifices the number of partitions that worker has. If the worker doesn't @@ -175,17 +177,16 @@ def df_concat(df_parts): else: return concat(df_parts) - assert s["rank"] in world + assert s["rank"] in workers - # Trimming + # Trimming such that all participanting workers get a rank within 0..len(workers) trim_map = {} for i in range(s["nworkers"]): - if i in world: + if i in workers: trim_map[i] = len(trim_map) - nworkers = len(world) rank = trim_map[s["rank"]] - eps = {trim_map[i]: s["eps"][trim_map[i]] for i in world if i != s["rank"]} + eps = {trim_map[i]: s["eps"][trim_map[i]] for i in workers if i != s["rank"]} df1 = df_concat(dfs_parts[0]) df2 = df_concat(dfs_parts[1]) @@ -194,7 +195,7 @@ def df_concat(df_parts): return df1.merge(df2, left_on=left_on, right_on=right_on) elif len(dfs_nparts[0]) == 1: return await single_partition_join( - nworkers, + len(workers), rank, eps, df1, @@ -208,7 +209,7 @@ def df_concat(df_parts): ) elif len(dfs_nparts[1]) == 1: return await single_partition_join( - nworkers, + len(workers), rank, eps, df1, @@ -221,7 +222,7 @@ def df_concat(df_parts): ], # Extracting the only key in `dfs_nparts[1]` ) else: - return await hash_join(nworkers, rank, eps, df1, df2, left_on, right_on) + return await hash_join(len(workers), rank, eps, df1, df2, left_on, right_on) def dataframe_merge(left, right, on=None, left_on=None, right_on=None, how="inner"): From 30c1fca1ea8de44aee1c1732428b665461e610ce Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 Jul 2020 15:52:03 +0200 Subject: [PATCH 078/126] Removed debug code --- dask_cuda/explicit_comms/comms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index 36897ceb..72cfbbc4 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -234,7 +234,6 @@ def dataframe_operation(self, coroutine, df_list, extra_args=tuple()): dfs = [] for df_parts in df_parts_list: dfs.append(df_parts.get(worker, [])) - # print("dfs: ", dfs) ret.append( self.submit(worker, coroutine, world, dfs_nparts, dfs, *extra_args) ) From 1496ac564b9d0958defb7fca4d0ca11b253e3b72 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Wed, 8 Jul 2020 21:27:51 +0200 Subject: [PATCH 079/126] Using collections.defaultdict(list) Co-authored-by: jakirkham --- dask_cuda/explicit_comms/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dask_cuda/explicit_comms/utils.py b/dask_cuda/explicit_comms/utils.py index b99ea4de..ae11ea9f 100644 --- a/dask_cuda/explicit_comms/utils.py +++ b/dask_cuda/explicit_comms/utils.py @@ -1,3 +1,4 @@ +from collections import defaultdict from toolz import first from dask import dataframe as dd @@ -12,13 +13,11 @@ def extract_ddf_partitions(ddf): wait(parts) key_to_part = dict([(str(part.key), part) for part in parts]) - ret = {} # Map worker -> [list of futures] + ret = defaultdict(list) # Map worker -> [list of futures] for key, workers in client.who_has(parts).items(): worker = first( workers ) # If multiple workers have the part, we pick the first worker - if worker not in ret: - ret[worker] = [] ret[worker].append(key_to_part[key]) return ret From f4f4a0fef1b783ce006eacdd000b99c29e8fa9ca Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 8 Jul 2020 21:29:34 +0200 Subject: [PATCH 080/126] reformat: isort --- dask_cuda/explicit_comms/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/explicit_comms/utils.py b/dask_cuda/explicit_comms/utils.py index ae11ea9f..893ebb3a 100644 --- a/dask_cuda/explicit_comms/utils.py +++ b/dask_cuda/explicit_comms/utils.py @@ -1,4 +1,5 @@ from collections import defaultdict + from toolz import first from dask import dataframe as dd From d62dd87a7e0145587307dc5c0d64fd4fd5707b3b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 9 Jul 2020 16:24:39 -0700 Subject: [PATCH 081/126] Add support for RMM managed memory --- dask_cuda/cli/dask_cuda_worker.py | 10 ++++++++++ dask_cuda/cuda_worker.py | 13 +++++++++---- dask_cuda/local_cuda_cluster.py | 23 +++++++++++++++-------- dask_cuda/utils.py | 13 +++++++++---- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index b343a224..5c5d2586 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -99,6 +99,14 @@ "the given size, otherwise no RMM pool is created. This can be " "an integer (bytes) or string (like 5GB or 5000M).", ) +@click.option( + "--rmm-managed-memory/--no-rmm-managed-memory", + default=False, + help="If enabled, initialize each worker with RMM and set it to " + "use managed memory. 
If disabled, RMM may still be used if " + "--rmm-pool-size is specified, but in that case with default " + "(non-managed) memory type.", +) @click.option( "--reconnect/--no-reconnect", default=True, @@ -188,6 +196,7 @@ def main( memory_limit, device_memory_limit, rmm_pool_size, + rmm_managed_memory, pid_file, resources, dashboard, @@ -223,6 +232,7 @@ def main( memory_limit, device_memory_limit, rmm_pool_size, + rmm_managed_memory, pid_file, resources, dashboard, diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index d57d9e81..2234103f 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -22,7 +22,7 @@ from .local_cuda_cluster import cuda_visible_devices from .utils import ( CPUAffinity, - RMMPool, + RMMSetup, get_cpu_affinity, get_device_total_memory, get_n_gpus, @@ -53,6 +53,7 @@ def __init__( memory_limit="auto", device_memory_limit="auto", rmm_pool_size=None, + rmm_managed_memory=False, pid_file=None, resources=None, dashboard=True, @@ -130,7 +131,7 @@ def del_pid_file(): if interface and host: raise ValueError("Can not specify both interface and host") - if rmm_pool_size is not None: + if rmm_pool_size is not None or rmm_managed_memory: try: import rmm # noqa F401 except ImportError: @@ -139,7 +140,8 @@ def del_pid_file(): "For installation instructions, please see " "https://github.com/rapidsai/rmm" ) # pragma: no cover - rmm_pool_size = parse_bytes(rmm_pool_size) + if rmm_pool_size is not None: + rmm_pool_size = parse_bytes(rmm_pool_size) # Ensure this parent dask-cuda-worker process uses the same UCX # configuration as child worker processes created by it. @@ -168,7 +170,10 @@ def del_pid_file(): preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"], security=security, env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)}, - plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)}, + plugins={ + CPUAffinity(get_cpu_affinity(i)), + RMMSetup(rmm_pool_size, rmm_managed_memory), + }, name=name if nprocs == 1 or not name else name + "-" + str(i), local_directory=local_directory, config={ diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index d5a6e83d..74714e3f 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -10,7 +10,7 @@ from .initialize import initialize from .utils import ( CPUAffinity, - RMMPool, + RMMSetup, get_cpu_affinity, get_device_total_memory, get_n_gpus, @@ -102,7 +102,11 @@ class LocalCUDACluster(LocalCluster): interfaces are connected and properly configured. rmm_pool_size: None, int or str When None (default), no RMM pool is initialized. If a different value - is given, it can be an integer (bytes) or string (like 5GB or 5000M)." + is given, it can be an integer (bytes) or string (like 5GB or 5000M). + rmm_pool_size: bool + If True, initialize each worker with RMM and set it to use managed + memory. If False, RMM may still be used if `rmm_pool_size` is specified, + but in that case with default (non-managed) memory type. 
Examples -------- @@ -142,6 +146,7 @@ def __init__( enable_rdmacm=False, ucx_net_devices=None, rmm_pool_size=None, + rmm_managed_memory=False, **kwargs, ): if CUDA_VISIBLE_DEVICES is None: @@ -157,16 +162,18 @@ def __init__( self.device_memory_limit = device_memory_limit self.rmm_pool_size = rmm_pool_size - if rmm_pool_size is not None: + self.rmm_managed_memory = rmm_managed_memory + if rmm_pool_size is not None or rmm_managed_memory: try: import rmm # noqa F401 except ImportError: raise ValueError( - "RMM pool requested but module 'rmm' is not available. " - "For installation instructions, please see " - "https://github.com/rapidsai/rmm" + "RMM pool or managed memory requested but module 'rmm' " + "is not available. For installation instructions, please " + "see https://github.com/rapidsai/rmm" ) # pragma: no cover - self.rmm_pool_size = parse_bytes(self.rmm_pool_size) + if self.rmm_pool_size is not None: + self.rmm_pool_size = parse_bytes(self.rmm_pool_size) if not processes: raise ValueError( @@ -265,7 +272,7 @@ def new_worker_spec(self): "env": {"CUDA_VISIBLE_DEVICES": visible_devices,}, "plugins": { CPUAffinity(get_cpu_affinity(worker_count)), - RMMPool(self.rmm_pool_size), + RMMSetup(self.rmm_pool_size, self.rmm_managed_memory), }, } ) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index e102c864..8c0adab8 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -28,16 +28,21 @@ def setup(self, worker=None): os.sched_setaffinity(0, self.cores) -class RMMPool: - def __init__(self, nbytes): +class RMMSetup: + def __init__(self, nbytes, managed_memory): self.nbytes = nbytes + self.managed_memory = managed_memory def setup(self, worker=None): - if self.nbytes is not None: + if self.nbytes is not None or self.managed_memory is True: import rmm + pool_allocator = False if self.nbytes is None else True + rmm.reinitialize( - pool_allocator=True, managed_memory=False, initial_pool_size=self.nbytes + pool_allocator=pool_allocator, + managed_memory=self.managed_memory, + initial_pool_size=self.nbytes, ) From 8d2d2c5baeea8df762f292456ab2a31054dbd9fb Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 9 Jul 2020 16:28:02 -0700 Subject: [PATCH 082/126] Add tests for RMM managed memory --- dask_cuda/tests/test_dask_cuda_worker.py | 5 +++-- dask_cuda/tests/test_local_cuda_cluster.py | 8 +++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index fe88a7c2..ad65e470 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -48,7 +48,7 @@ def get_visible_devices(): del os.environ["CUDA_VISIBLE_DEVICES"] -def test_rmm_pool(loop): # noqa: F811 +def test_rmm(loop): # noqa: F811 rmm = pytest.importorskip("rmm") with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): with popen( @@ -59,6 +59,7 @@ def test_rmm_pool(loop): # noqa: F811 "127.0.0.1", "--rmm-pool-size", "2 GB", + "--rmm-managed-memory", "--no-dashboard", ] ): @@ -67,4 +68,4 @@ def test_rmm_pool(loop): # noqa: F811 memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): - assert v is rmm._lib.memory_resource.CNMemMemoryResource + assert v is rmm._lib.memory_resource.CNMemManagedMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 2702cfc1..71f6dd2b 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ 
b/dask_cuda/tests/test_local_cuda_cluster.py @@ -108,11 +108,13 @@ async def test_n_workers(): @gen_test(timeout=20) -async def test_rmm_pool(): +async def test_rmm(): rmm = pytest.importorskip("rmm") - async with LocalCUDACluster(rmm_pool_size="2GB", asynchronous=True) as cluster: + async with LocalCUDACluster( + rmm_pool_size="2GB", rmm_managed_memory=True, asynchronous=True + ) as cluster: async with Client(cluster, asynchronous=True) as client: memory_resource_type = await client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): - assert v is rmm._lib.memory_resource.CNMemMemoryResource + assert v is rmm._lib.memory_resource.CNMemManagedMemoryResource From 22c7d07419c5bd14baa59643ca152e114dc2f2a3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 10 Jul 2020 07:16:54 -0700 Subject: [PATCH 083/126] Raise exception when RMM managed memory and NVLink are both enabled. --- dask_cuda/cli/dask_cuda_worker.py | 4 +++- dask_cuda/cuda_worker.py | 5 +++++ dask_cuda/local_cuda_cluster.py | 7 +++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index 5c5d2586..92c7ede2 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -105,7 +105,9 @@ help="If enabled, initialize each worker with RMM and set it to " "use managed memory. If disabled, RMM may still be used if " "--rmm-pool-size is specified, but in that case with default " - "(non-managed) memory type.", + "(non-managed) memory type." + "WARNING: managed memory is currently incompatible with NVLink, " + "trying to enable both will result in an exception.", ) @click.option( "--reconnect/--no-reconnect", diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 2234103f..50f19e72 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -143,6 +143,11 @@ def del_pid_file(): if rmm_pool_size is not None: rmm_pool_size = parse_bytes(rmm_pool_size) + if enable_nvlink and rmm_managed_memory: + raise ValueError( + "RMM managed memory and NVLink are currently incompatible." + ) + # Ensure this parent dask-cuda-worker process uses the same UCX # configuration as child worker processes created by it. initialize( diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 74714e3f..b0cf91f6 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -103,10 +103,12 @@ class LocalCUDACluster(LocalCluster): rmm_pool_size: None, int or str When None (default), no RMM pool is initialized. If a different value is given, it can be an integer (bytes) or string (like 5GB or 5000M). - rmm_pool_size: bool + rmm_managed_memory: bool If True, initialize each worker with RMM and set it to use managed memory. If False, RMM may still be used if `rmm_pool_size` is specified, but in that case with default (non-managed) memory type. + WARNING: managed memory is currently incompatible with NVLink, trying + to enable both will result in an exception. Examples -------- @@ -122,7 +124,8 @@ class LocalCUDACluster(LocalCluster): ValueError If ucx_net_devices is an empty string, or if it is "auto" and UCX-Py is not installed, or if it is "auto" and enable_infiniband=False, or UCX-Py - wasn't compiled with hwloc support. + wasn't compiled with hwloc support, or both RMM managed memory and + NVLink are enabled. 
See Also -------- From 018c3998a6b75ff58eae44fa0e27db184404a385 Mon Sep 17 00:00:00 2001 From: sean-frye Date: Mon, 13 Jul 2020 10:54:39 -0700 Subject: [PATCH 084/126] update docker image --- ci/local/build.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/local/build.sh b/ci/local/build.sh index 443e3c14..72ccf9e4 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -1,6 +1,9 @@ #!/bin/bash -DOCKER_IMAGE="gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6" +GIT_DESCRIBE_TAG=`git describe --tags` +MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" @@ -139,4 +142,4 @@ docker run --rm -it ${GPU_OPTS} \ -v "$PASSWD_FILE":/etc/passwd:ro \ -v "$GROUP_FILE":/etc/group:ro \ --cap-add=SYS_PTRACE \ - "${DOCKER_IMAGE}" bash -c "${COMMAND}" \ No newline at end of file + "${DOCKER_IMAGE}" bash -c "${COMMAND}" From 42594c8ecac2a9b389d3f17fea1a2c86fc4ab16c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 13 Jul 2020 15:27:06 -0700 Subject: [PATCH 085/126] Ensure UCX configuration for explicit_comms tests --- dask_cuda/tests/test_explicit_comms.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index 5ca042f9..a64d3402 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import dask from dask import dataframe as dd from distributed import Client from distributed.deploy.local import LocalCluster @@ -45,6 +46,12 @@ def test_local_cluster(protocol): def _test_dataframe_merge(backend, protocol, n_workers): + dask.config.update( + dask.config.global_config, + {"ucx": {"TLS": "tcp,sockcm,cuda_copy",},}, + priority="new", + ) + with LocalCluster( protocol=protocol, dashboard_address=None, From d7de57b36f81b58f93095b86a192a52a82192b35 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 14 Jul 2020 04:52:17 -0700 Subject: [PATCH 086/126] Add new reuse-endpoints distributed variable --- dask_cuda/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 8c0adab8..72c7736c 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -219,6 +219,7 @@ def get_ucx_config( "rdmacm": None, "net-devices": None, "cuda_copy": None, + "reuse-endpoints": True, } if enable_tcp_over_ucx or enable_infiniband or enable_nvlink: ucx_config["cuda_copy"] = True From 19a3fb0745bb5390dc6fc196a34f633030dd7d93 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Wed, 15 Jul 2020 16:19:47 -0400 Subject: [PATCH 087/126] FIX Update/remove references to master branch --- ci/cpu/upload-pypi.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ci/cpu/upload-pypi.sh b/ci/cpu/upload-pypi.sh index 07ba15a1..4eb6a2ad 100755 --- a/ci/cpu/upload-pypi.sh +++ b/ci/cpu/upload-pypi.sh @@ -1,10 +1,8 @@ #!/bin/bash set -e -SOURCE_BRANCH=master -# Restrict uploads to master branch -if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then +if [ ${BUILD_MODE} != "branch" ]; then echo "Skipping upload" return 0 fi From f90d8dcb8adb2afe1916dc1db8414471226582cf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 24 Jul 2020 14:35:29 -0700 Subject: [PATCH 088/126] Ensure `DeviceSerialized` supports pickling Make sure that `DeviceSerialized` objects can proceed through the `"pickle"` 
serialization protocol. Ensure that this is done in a way where out-of-band pickling is supported through pickle protocol 5 where possible and a fallback to older serialization protocols still works when that is not an option. --- dask_cuda/device_host_file.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index 90cdd869..00c74709 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -37,6 +37,11 @@ def __init__(self, header, frames): def __sizeof__(self): return sum(map(nbytes, self.frames)) + def __reduce_ex__(self, protocol): + header, frames = device_serialize(self) + frames = [f.obj for f in frames] + return device_deserialize, (header, frames) + @dask_serialize.register(DeviceSerialized) def device_serialize(obj): From c2889be9a40db54e791b4338c274832bc99bae3d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 24 Jul 2020 14:37:46 -0700 Subject: [PATCH 089/126] Test round-trip serialization with pickle Make sure that `DeviceSerialized` objects can proceed through pickle serialization if needed. Also check that they serialize efficiently with pickle protocol 5 (when that is an option). --- dask_cuda/tests/test_device_host_file.py | 25 +++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 0d12f97e..5e0fb949 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -6,7 +6,13 @@ import dask from dask import array as da -from distributed.protocol import deserialize_bytes, serialize_bytelist +from distributed.protocol import ( + deserialize, + serialize, + deserialize_bytes, + serialize_bytelist, +) +from distributed.protocol.pickle import HIGHEST_PROTOCOL from dask_cuda.device_host_file import ( DeviceHostFile, @@ -171,3 +177,20 @@ def test_serialize_cupy_collection(collection, length, value): assert isinstance(res, collection) values = res.values() if collection is dict else res [assert_func(v, x) for v in values] + + header, frames = serialize(obj, serializers=["pickle"]) + + if HIGHEST_PROTOCOL >= 5: + assert len(frames) == (1 + len(obj.frames)) + else: + assert len(frames) == 1 + + obj2 = deserialize(header, frames) + res = host_to_device(obj2) + + if length == 0: + assert_func(res, x) + else: + assert isinstance(res, collection) + values = res.values() if collection is dict else res + [assert_func(v, x) for v in values] From a6fabf5583ab6f54fa12b3b7aa9aa1ffd6fd1de9 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 4 Aug 2020 09:48:09 -0500 Subject: [PATCH 090/126] check for gpu usage rather try/except in functions --- dask_cuda/benchmarks/local_cudf_merge.py | 41 ++++++++++++------------ dask_cuda/benchmarks/utils.py | 8 ++--- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index fa7896b4..a435387e 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -2,7 +2,6 @@ from collections import defaultdict from time import perf_counter as clock -import cupy import numpy from dask.base import tokenize @@ -10,8 +9,6 @@ from dask.distributed import Client, performance_report, wait from dask.utils import format_bytes, format_time, parse_bytes -import cudf - from dask_cuda import explicit_comms from dask_cuda.benchmarks.utils import ( get_cluster_options, @@ -24,17 +21,17 @@ 
# -def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, - gpu): +def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu): # Setting a seed that triggers max amount of comm in the two-GPU case. if gpu: import cupy as xp + import cudf as xdf else: import numpy as xp import pandas as xdf - xp.random.seed(2**32 - 1) + xp.random.seed(2 ** 32 - 1) chunk_type = chunk_type or "build" frac_match = frac_match or 1.0 @@ -57,9 +54,7 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, { "key": xp.arange(start, stop=stop, dtype="int64"), "shuffle": xp.random.permutation(suffle_array)[:local_size], - "payload": xp.random.permutation( - xp.arange(local_size, dtype="int64") - ), + "payload": xp.random.permutation(xp.arange(local_size, dtype="int64")), } ) else: @@ -96,9 +91,7 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, df = xdf.DataFrame( { "key": xp.random.permutation(key_array_combine), - "payload": xp.random.permutation( - xp.arange(local_size, dtype="int64") - ), + "payload": xp.random.permutation(xp.arange(local_size, dtype="int64")), } ) return df @@ -107,15 +100,22 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args): parts = [chunk_size for i in range(num_chunks)] - device_type = True if args.type is 'gpu' else False + device_type = True if args.type is "gpu" else False meta = generate_chunk(0, 4, 1, chunk_type, None, device_type) divisions = [None] * (len(parts) + 1) name = "generate-data-" + tokenize(chunk_size, num_chunks, frac_match, chunk_type) graph = { - (name, i): (generate_chunk, i, part, len(parts), chunk_type, - frac_match, device_type) + (name, i): ( + generate_chunk, + i, + part, + len(parts), + chunk_type, + frac_match, + device_type, + ) for i, part in enumerate(parts) } @@ -202,10 +202,11 @@ def main(args): client = Client(scheduler_addr if args.multi_node else cluster) - client.run(setup_memory_pool, disable_pool=args.no_rmm_pool) - # Create an RMM pool on the scheduler due to occasional deserialization - # of CUDA objects. May cause issues with InfiniBand otherwise. - client.run_on_scheduler(setup_memory_pool, 1e9, disable_pool=args.no_rmm_pool) + if args.type == "gpu": + client.run(setup_memory_pool, disable_pool=args.no_rmm_pool) + # Create an RMM pool on the scheduler due to occasional deserialization + # of CUDA objects. May cause issues with InfiniBand otherwise. 
+ client.run_on_scheduler(setup_memory_pool, 1e9, disable_pool=args.no_rmm_pool) scheduler_workers = client.run_on_scheduler(get_scheduler_workers) n_workers = len(scheduler_workers) @@ -301,8 +302,6 @@ def parse_args(): "type": str, "help": "Do merge with GPU or CPU dataframes", }, - - { "name": ["-c", "--chunk-size",], "default": 1_000_000, diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index fc5d4259..32d92158 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -184,11 +184,9 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): - try: - import rmm - import cupy - except ImportError: - return None + import cupy + + import rmm rmm.reinitialize( pool_allocator=not disable_pool, devices=0, initial_pool_size=pool_size, From 4a7058c46388391a6b6b6f95165a83eb446bf71d Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 4 Aug 2020 09:54:14 -0500 Subject: [PATCH 091/126] delay cudf import in explicit comms --- dask_cuda/explicit_comms/dataframe_merge.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index 0c933b86..ad5ad8af 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -5,11 +5,6 @@ from dask.dataframe.shuffle import partitioning_index, shuffle_group from distributed.protocol import to_serialize -try: - import cudf -except ImportError: - pass - from . import comms @@ -66,10 +61,14 @@ async def exchange_and_concat_bins(rank, eps, bins): def concat(df_list): if len(df_list) == 0: return None - elif isinstance(df_list[0], (cudf.DataFrame, cudf.Series)): - return cudf.concat(df_list) else: - return pandas.concat(df_list) + typ = str(type(df_list[0])) + if 'cudf' in typ: + # delay import of cudf to handle CPU only tests + import cudf + return cudf.concat(df_list) + else: + return pandas.concat(df_list) def partition_by_hash(df, columns, n_chunks, ignore_index=False): From 76d3f8f0a758da9e8e811cd5e989bc99ed8fe3d7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 4 Aug 2020 09:59:42 -0500 Subject: [PATCH 092/126] lint --- dask_cuda/benchmarks/local_cudf_merge.py | 2 +- dask_cuda/local_cuda_cluster.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index 4dc05308..8762f0cf 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -100,7 +100,7 @@ def generate_chunk(i_chunk, local_size, num_chunks, chunk_type, frac_match, gpu) def get_random_ddf(chunk_size, num_chunks, frac_match, chunk_type, args): parts = [chunk_size for i in range(num_chunks)] - device_type = True if args.type is "gpu" else False + device_type = True if args.type == "gpu" else False meta = generate_chunk(0, 4, 1, chunk_type, None, device_type) divisions = [None] * (len(parts) + 1) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index b0cf91f6..6843f257 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -208,7 +208,8 @@ def __init__( if ucx_net_devices == "auto": try: - from ucp._libs.topological_distance import TopologicalDistance # NOQA + from ucp._libs.topological_distance import \ + TopologicalDistance # NOQA except ImportError: raise ValueError( "ucx_net_devices set to 'auto' but UCX-Py is not " From 
b203e4e9bed60e0e94c73810c5aad32f36fcd502 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 4 Aug 2020 09:16:27 -0700 Subject: [PATCH 093/126] use cudf assert frame equal --- dask_cuda/tests/test_explicit_comms.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_cuda/tests/test_explicit_comms.py b/dask_cuda/tests/test_explicit_comms.py index a64d3402..1ad45f8a 100644 --- a/dask_cuda/tests/test_explicit_comms.py +++ b/dask_cuda/tests/test_explicit_comms.py @@ -10,6 +10,7 @@ from distributed.deploy.local import LocalCluster import cudf +from cudf.tests.utils import assert_eq from dask_cuda.explicit_comms import CommsContext, dataframe_merge @@ -83,10 +84,10 @@ def _test_dataframe_merge(backend, protocol, n_workers): got = ddf3.compute() if backend == "cudf": - got = got.to_pandas() - got.index.names = ["key"] # TODO: this shouldn't be needed + assert_eq(got, expected) - pd.testing.assert_frame_equal(got, expected) + else: + pd.testing.assert_frame_equal(got, expected) @pytest.mark.parametrize("nworkers", [1, 2, 4]) From 37f77cac740a274c818d562698be651ef7077fb0 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 11 Aug 2020 11:59:23 -0700 Subject: [PATCH 094/126] Use get_n_gpus for RMM test with dask-cuda-worker --- dask_cuda/tests/test_dask_cuda_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index ad65e470..97ec9da0 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -9,7 +9,7 @@ from distributed.utils_test import loop # noqa: F401 from distributed.utils_test import popen -from dask_cuda.utils import get_gpu_count, wait_workers +from dask_cuda.utils import get_n_gpus, wait_workers def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 @@ -64,7 +64,7 @@ def test_rmm(loop): # noqa: F811 ] ): with Client("127.0.0.1:9369", loop=loop) as client: - assert wait_workers(client, n_gpus=get_gpu_count()) + assert wait_workers(client, n_gpus=get_n_gpus()) memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): From 58013f8560182515ee8768ae5a638afb36798b54 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 19:43:28 -0700 Subject: [PATCH 095/126] Access memory resources from public module --- dask_cuda/tests/test_dask_cuda_worker.py | 2 +- dask_cuda/tests/test_local_cuda_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index ad65e470..b1326a6a 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -68,4 +68,4 @@ def test_rmm(loop): # noqa: F811 memory_resource_type = client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): - assert v is rmm._lib.memory_resource.CNMemManagedMemoryResource + assert v is rmm.mr.CNMemManagedMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 71f6dd2b..b85a4186 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -117,4 +117,4 @@ async def test_rmm(): async with Client(cluster, asynchronous=True) as client: memory_resource_type = await client.run(rmm.mr.get_default_resource_type) for v in memory_resource_type.values(): - assert v is 
rmm._lib.memory_resource.CNMemManagedMemoryResource + assert v is rmm.mr.CNMemManagedMemoryResource From e1438d09207c2ced8b42addac8473bc2029d01dc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 19:50:02 -0700 Subject: [PATCH 096/126] Handle removal of `get_default_resource_type` As `get_default_resource_type` was dropped in RMM recently, use the newly introduce `rmm.mr.get_per_device_resource` instead to access the resource on device `0` (configured to a unique device for each worker). Since the memory resource itself is not realistically serializable, instead grab the type of each resource to send back. This is all done within a `lambda` to allow for a function that can be run on each worker. --- dask_cuda/tests/test_dask_cuda_worker.py | 4 +++- dask_cuda/tests/test_local_cuda_cluster.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index b1326a6a..86acbce2 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -66,6 +66,8 @@ def test_rmm(loop): # noqa: F811 with Client("127.0.0.1:9369", loop=loop) as client: assert wait_workers(client, n_gpus=get_gpu_count()) - memory_resource_type = client.run(rmm.mr.get_default_resource_type) + memory_resource_type = client.run( + lambda: type(rmm.mr.get_per_device_resource(0)) + ) for v in memory_resource_type.values(): assert v is rmm.mr.CNMemManagedMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index b85a4186..8ecc9b3d 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -115,6 +115,8 @@ async def test_rmm(): rmm_pool_size="2GB", rmm_managed_memory=True, asynchronous=True ) as cluster: async with Client(cluster, asynchronous=True) as client: - memory_resource_type = await client.run(rmm.mr.get_default_resource_type) + memory_resource_type = await client.run( + lambda: type(rmm.mr.get_per_device_resource(0)) + ) for v in memory_resource_type.values(): assert v is rmm.mr.CNMemManagedMemoryResource From 69680e7607edf9f4f0ffce1bb9bfb5e0e2229906 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 19:50:06 -0700 Subject: [PATCH 097/126] Test for RMM's `PoolMemoryResource` As CNMeM has been dropped from Python and replaced with RMM's own pool resource, just check for that pool resource instead. 
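A rough end-to-end sketch of what this check amounts to (illustrative only, not taken from the diff below; assumes `rmm` and dask-cuda are installed, a GPU is available, and the 2GB pool size is just a placeholder):

    # Illustrative sketch: verify each worker installed an RMM pool resource.
    import rmm
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    with LocalCUDACluster(rmm_pool_size="2GB") as cluster:
        with Client(cluster) as client:
            # Query the memory resource type currently set on each worker's device.
            resource_types = client.run(
                lambda: type(rmm.mr.get_per_device_resource(0))
            )
            assert all(
                t is rmm.mr.PoolMemoryResource for t in resource_types.values()
            )
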
--- dask_cuda/tests/test_dask_cuda_worker.py | 2 +- dask_cuda/tests/test_local_cuda_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 86acbce2..698a33f7 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -70,4 +70,4 @@ def test_rmm(loop): # noqa: F811 lambda: type(rmm.mr.get_per_device_resource(0)) ) for v in memory_resource_type.values(): - assert v is rmm.mr.CNMemManagedMemoryResource + assert v is rmm.mr.PoolMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 8ecc9b3d..ed80e127 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -119,4 +119,4 @@ async def test_rmm(): lambda: type(rmm.mr.get_per_device_resource(0)) ) for v in memory_resource_type.values(): - assert v is rmm.mr.CNMemManagedMemoryResource + assert v is rmm.mr.PoolMemoryResource From cd4b9ac807d02c8f54395b3ed9cb81ef63666ec5 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 11 Aug 2020 20:01:34 -0700 Subject: [PATCH 098/126] Use `get_current_device_resource_type` Simplify the resource type checks a bit. Thanks Mark! :) Co-authored-by: Mark Harris --- dask_cuda/tests/test_dask_cuda_worker.py | 2 +- dask_cuda/tests/test_local_cuda_cluster.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 698a33f7..40a625a1 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -67,7 +67,7 @@ def test_rmm(loop): # noqa: F811 assert wait_workers(client, n_gpus=get_gpu_count()) memory_resource_type = client.run( - lambda: type(rmm.mr.get_per_device_resource(0)) + rmm.mr.get_current_device_resource_type ) for v in memory_resource_type.values(): assert v is rmm.mr.PoolMemoryResource diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index ed80e127..ab39e0b8 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -116,7 +116,7 @@ async def test_rmm(): ) as cluster: async with Client(cluster, asynchronous=True) as client: memory_resource_type = await client.run( - lambda: type(rmm.mr.get_per_device_resource(0)) + rmm.mr.get_current_device_resource_type ) for v in memory_resource_type.values(): assert v is rmm.mr.PoolMemoryResource From 626c84854326c0e5015191375a41be37e62b53aa Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 20:05:38 -0700 Subject: [PATCH 099/126] Fix a black error in explicit comms --- dask_cuda/explicit_comms/dataframe_merge.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/explicit_comms/dataframe_merge.py b/dask_cuda/explicit_comms/dataframe_merge.py index ad5ad8af..91e0e3e4 100644 --- a/dask_cuda/explicit_comms/dataframe_merge.py +++ b/dask_cuda/explicit_comms/dataframe_merge.py @@ -63,9 +63,10 @@ def concat(df_list): return None else: typ = str(type(df_list[0])) - if 'cudf' in typ: + if "cudf" in typ: # delay import of cudf to handle CPU only tests import cudf + return cudf.concat(df_list) else: return pandas.concat(df_list) From d4567b14211ecf0c188821cf8d19a12886384abe Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 11 Aug 2020 20:22:23 -0700 Subject: [PATCH 100/126] Fix an `isort` error --- 
dask_cuda/tests/test_device_host_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index 5e0fb949..a3327b9c 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -8,8 +8,8 @@ from dask import array as da from distributed.protocol import ( deserialize, - serialize, deserialize_bytes, + serialize, serialize_bytelist, ) from distributed.protocol.pickle import HIGHEST_PROTOCOL From 5877d3a746de913f6ece2f702026b6303be9bb39 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 12 Aug 2020 14:00:31 -0700 Subject: [PATCH 101/126] avoid cuda driver initialization from rmm --- dask_cuda/benchmarks/utils.py | 2 ++ dask_cuda/cuda_worker.py | 1 + dask_cuda/initialize.py | 3 +++ dask_cuda/local_cuda_cluster.py | 1 + dask_cuda/utils.py | 1 + 5 files changed, 8 insertions(+) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 32d92158..552d3109 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -186,6 +186,8 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import cupy + print("FOODSFSDF") + os.environ['RMM_NO_INITIALIZE'] = 'True' import rmm rmm.reinitialize( diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 50f19e72..0afd9928 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -133,6 +133,7 @@ def del_pid_file(): if rmm_pool_size is not None or rmm_managed_memory: try: + os.environ['RMM_NO_INITIALIZE'] = 'True' import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index e53c10de..d98849d5 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -24,6 +24,7 @@ about Dask configuration. 
""" import logging +import os import click import numba.cuda @@ -46,6 +47,7 @@ def initialize( ): if create_cuda_context: try: + os.environ['RMM_NO_INITIALIZE'] = 'True' numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -105,6 +107,7 @@ def dask_setup( ): if create_cuda_context: try: + os.environ['RMM_NO_INITIALIZE'] = 'True' numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 6843f257..6f08519d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -168,6 +168,7 @@ def __init__( self.rmm_managed_memory = rmm_managed_memory if rmm_pool_size is not None or rmm_managed_memory: try: + os.environ['RMM_NO_INITIALIZE'] = 'True' import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 72c7736c..b65507ac 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -35,6 +35,7 @@ def __init__(self, nbytes, managed_memory): def setup(self, worker=None): if self.nbytes is not None or self.managed_memory is True: + os.environ['RMM_NO_INITIALIZE'] = 'True' import rmm pool_allocator = False if self.nbytes is None else True From 63cce2541d877cbaca300897bf07197d91894abb Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 12 Aug 2020 17:46:09 -0400 Subject: [PATCH 102/126] Update dask_cuda/benchmarks/utils.py woops Co-authored-by: jakirkham --- dask_cuda/benchmarks/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 552d3109..2b81b2a9 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -186,7 +186,6 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import cupy - print("FOODSFSDF") os.environ['RMM_NO_INITIALIZE'] = 'True' import rmm From 2d0503f2c6d5ab146871435b10dcd06b4f1ac662 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 12 Aug 2020 14:50:32 -0700 Subject: [PATCH 103/126] forgot os import --- dask_cuda/benchmarks/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 2b81b2a9..40b969a6 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -1,4 +1,5 @@ import argparse +import os from dask.distributed import SSHCluster From fabc0b5473212432e91632a80bb6d7c703b5701f Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 07:36:13 -0700 Subject: [PATCH 104/126] fix bash lines in docs --- docs/source/ucx.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index 732d5a8b..8559378d 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -43,11 +43,13 @@ dask-scheduler The ``dask-scheduler`` has no parameters for UCX configuration -- different from what we will see for ``dask-cuda-worker`` on the next section -- for that reason we rely on Dask environment variables. Here's how to start the scheduler with all transports that are currently supported by Dask-CUDA: .. 
code-block:: bash + DASK_RMM__POOL_SIZE=1GB DASK_UCX__CUDA_COPY=True DASK_UCX__TCP=True DASK_UCX__NVLINK=True DASK_UCX__INFINIBAND=True DASK_UCX__RDMACM=True DASK_UCX__NET_DEVICES=mlx5_0:1 dask-scheduler --protocol ucx --interface ib0 Note above how we use ``DASK_UCX__NET_DEVICES=mlx5_0:1`` (the Mellanox name for ``ib0``) and the same interface with ``--interface ib0``. If the system doesn't have an InfiniBand interface available, you would normally use the main network interface, such as ``eth0``, as seen below: .. code-block:: bash + DASK_RMM__POOL_SIZE=1GB DASK_UCX__CUDA_COPY=True DASK_UCX__TCP=True DASK_UCX__NVLINK=True dask-scheduler --protocol ucx --interface eth0 Setting ``DASK_UCX__NET_DEVICES`` when using an interface that isn't an InfiniBand can generally be skipped. @@ -67,6 +69,7 @@ All ``DASK_*`` configurations described above have analogous parameters in ``das Here's how to start workers with all transports that are currently relevant for us: .. code-block:: bash + dask-cuda-worker ucx://SCHEDULER_IB0_IP:8786 --enable-tcp-over-ucx --enable-nvlink --enable-infiniband -- enable-rdmacm --net-devices="auto" --rmm-pool-size="30GB" From 9a31ffde643da6fbc9345a3aa697f10073fd84e6 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 08:20:05 -0700 Subject: [PATCH 105/126] update rtd config file --- readthedocs.yml => .readthedocs.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) rename readthedocs.yml => .readthedocs.yml (51%) diff --git a/readthedocs.yml b/.readthedocs.yml similarity index 51% rename from readthedocs.yml rename to .readthedocs.yml index b11377da..7238980b 100644 --- a/readthedocs.yml +++ b/.readthedocs.yml @@ -1,9 +1,16 @@ +version: 2 + build: image: latest +sphinx: + configuration: docs/conf.py + python: version: 3.7 - setup_py_install: true + install: + - method: pip + path: . conda: file: conda/environments/builddocs_py37.yml From 78321a3902f08de083ebcb2acc9ca69594b5ff51 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 08:30:24 -0700 Subject: [PATCH 106/126] fix conda path key --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 7238980b..2e6d3a85 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -13,4 +13,4 @@ python: path: . conda: - file: conda/environments/builddocs_py37.yml + environment: conda/environments/builddocs_py37.yml From fb200afe4d2580f271eae81e64546761ae92003b Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 08:41:10 -0700 Subject: [PATCH 107/126] small doc fix --- docs/source/ucx.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/ucx.rst b/docs/source/ucx.rst index 8559378d..b149cf94 100644 --- a/docs/source/ucx.rst +++ b/docs/source/ucx.rst @@ -81,6 +81,7 @@ The same configurations used for the scheduler should be used by the client. One One can use ``os.environ`` inside the client script, it's important to set them at the very top before importing anything other than ``os``. See example below: .. 
code-block:: python + import os os.environ["DASK_RMM__POOL_SIZE"] = "1GB" From 74a9754add397f0bf1224b795b3098f0f6a0adba Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 10:38:39 -0700 Subject: [PATCH 108/126] fix docs --- dask_cuda/local_cuda_cluster.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 6f08519d..e2315d8f 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -60,8 +60,6 @@ class LocalCUDACluster(LocalCluster): CUDA_VISIBLE_DEVICES: str String like ``"0,1,2,3"`` or ``[0, 1, 2, 3]`` to restrict activity to different GPUs - Parameters - ---------- interface: str The external interface used to connect to the scheduler, usually an ethernet interface is used for connection, and not an InfiniBand From 483d5bcc61a1cdd85cb1a8cbc90760287a4a92a7 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 10:46:02 -0700 Subject: [PATCH 109/126] more docs fixes --- docs/source/conf.py | 3 +++ docs/source/index.rst | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 213aafb5..4bb283d1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -47,8 +47,11 @@ "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.extlinks", + "numpydoc", ] +numpydoc_show_class_members = False + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/docs/source/index.rst b/docs/source/index.rst index 3d53e39c..507e3066 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ Dask-CUDA -====== +========= Dask-CUDA is tool for using `Dask `_ on GPUs. It extends Dask's `Single-Machine Cluster `_ and `Workers `_ for optimized distributed GPU workloads. 
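The UCX and RMM options touched by the patches above are normally consumed through ``LocalCUDACluster``. A minimal sketch of such a client script follows — the option values (``protocol="ucx"``, the enabled transports, the ``1GB`` pool) are illustrative assumptions for this aside and are not taken from any patch in this series:

.. code-block:: python

    from dask.distributed import Client

    from dask_cuda import LocalCUDACluster

    # Illustrative values only: enable UCX transports and give each worker an RMM pool.
    cluster = LocalCUDACluster(
        protocol="ucx",
        enable_tcp_over_ucx=True,
        enable_nvlink=True,
        rmm_pool_size="1GB",
    )
    client = Client(cluster)

With a cluster started this way, the scheduler- and client-side ``DASK_UCX__*`` settings described in the ``ucx.rst`` changes above apply to the external ``dask-scheduler`` process, while the worker-side transports are controlled by the keyword arguments shown here.
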
From 7389387cefb436ff6abf11e5ec0fe39039e0b503 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 11:26:13 -0700 Subject: [PATCH 110/126] Run `black` on Dask-CUDA --- dask_cuda/benchmarks/utils.py | 2 +- dask_cuda/cuda_worker.py | 2 +- dask_cuda/initialize.py | 4 ++-- dask_cuda/local_cuda_cluster.py | 2 +- dask_cuda/utils.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 40b969a6..3d07474d 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -187,7 +187,7 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import cupy - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" import rmm rmm.reinitialize( diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 0afd9928..897e3a85 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -133,7 +133,7 @@ def del_pid_file(): if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index d98849d5..8d98f596 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -47,7 +47,7 @@ def initialize( ): if create_cuda_context: try: - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -107,7 +107,7 @@ def dask_setup( ): if create_cuda_context: try: - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 6f08519d..a725ff99 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -168,7 +168,7 @@ def __init__( self.rmm_managed_memory = rmm_managed_memory if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index b65507ac..6a56ac94 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -35,7 +35,7 @@ def __init__(self, nbytes, managed_memory): def setup(self, worker=None): if self.nbytes is not None or self.managed_memory is True: - os.environ['RMM_NO_INITIALIZE'] = 'True' + os.environ["RMM_NO_INITIALIZE"] = "True" import rmm pool_allocator = False if self.nbytes is None else True From 1eac90701ded0333755bb95a4d8f38ba547222dd Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 14 Aug 2020 11:28:34 -0700 Subject: [PATCH 111/126] Replace RMM_NO_INITIALIZE w/ RAPIDS_NO_INITIALIZE --- dask_cuda/benchmarks/utils.py | 2 +- dask_cuda/cuda_worker.py | 2 +- dask_cuda/initialize.py | 4 ++-- dask_cuda/local_cuda_cluster.py | 2 +- dask_cuda/utils.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 3d07474d..5aeaa9b0 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -187,7 +187,7 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import 
cupy - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm rmm.reinitialize( diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index 897e3a85..f9a82f68 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -133,7 +133,7 @@ def del_pid_file(): if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index 8d98f596..6fc42548 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -47,7 +47,7 @@ def initialize( ): if create_cuda_context: try: - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -107,7 +107,7 @@ def dask_setup( ): if create_cuda_context: try: - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index a725ff99..fd154326 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -168,7 +168,7 @@ def __init__( self.rmm_managed_memory = rmm_managed_memory if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 6a56ac94..63771238 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -35,7 +35,7 @@ def __init__(self, nbytes, managed_memory): def setup(self, worker=None): if self.nbytes is not None or self.managed_memory is True: - os.environ["RMM_NO_INITIALIZE"] = "True" + os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm pool_allocator = False if self.nbytes is None else True From 1c9cee6d626fe20bb4b78e4e1ea724e8d3a32835 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Fri, 14 Aug 2020 12:00:59 -0700 Subject: [PATCH 112/126] one more fix --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 2e6d3a85..eba812c2 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,7 +4,7 @@ build: image: latest sphinx: - configuration: docs/conf.py + configuration: docs/source/conf.py python: version: 3.7 From 7a80341c2024f398f5077d1357b062893412b1d5 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 17 Aug 2020 09:59:15 -0700 Subject: [PATCH 113/126] Confirm DGX tests are running baremetal --- dask_cuda/tests/test_dgx.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index b2378ffc..4a120d80 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -29,8 +29,12 @@ class DGXVersion(Enum): def _get_dgx_name(): product_name_file = "/sys/class/dmi/id/product_name" + dgx_release_file = "/etc/dgx-release" - if not os.path.isfile(product_name_file): + # We verify `product_name_file` to check it's a DGX, and check + # if `dgx_release_file` exists to confirm it's not a container. 
+ if (not os.path.isfile(product_name_file) or + not os.path.isfile(dgx_release_file)): return None for line in open(product_name_file): @@ -38,17 +42,16 @@ def _get_dgx_name(): def _get_dgx_version(): - dgx_server = None dgx_name = _get_dgx_name() - if "DGX-1" in dgx_name: - dgx_server = DGXVersion.DGX_1 + if dgx_name is None: + return None + elif "DGX-1" in dgx_name: + return DGXVersion.DGX_1 elif "DGX-2" in dgx_name: - dgx_server = DGXVersion.DGX_2 + return DGXVersion.DGX_2 elif "DGXA100" in dgx_name: - dgx_server = DGXVersion.DGX_A100 - - return dgx_server + return DGXVersion.DGX_A100 def _get_dgx_net_devices(): From 152d659e060f8e6a24e37e101b6f7f7c0f7e931c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 17 Aug 2020 15:16:01 -0700 Subject: [PATCH 114/126] Fix black formatting --- dask_cuda/tests/test_dgx.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_cuda/tests/test_dgx.py b/dask_cuda/tests/test_dgx.py index 4a120d80..86d82e6b 100644 --- a/dask_cuda/tests/test_dgx.py +++ b/dask_cuda/tests/test_dgx.py @@ -33,8 +33,7 @@ def _get_dgx_name(): # We verify `product_name_file` to check it's a DGX, and check # if `dgx_release_file` exists to confirm it's not a container. - if (not os.path.isfile(product_name_file) or - not os.path.isfile(dgx_release_file)): + if not os.path.isfile(product_name_file) or not os.path.isfile(dgx_release_file): return None for line in open(product_name_file): From a5b8d310cd8cd04e7a90b0fb6a4eea4196dadc95 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 08:42:02 -0700 Subject: [PATCH 115/126] Update changelog for 0.15 --- CHANGELOG.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6da59671..40c57a9d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,37 @@ +0.15 +---- +- Fix-up versioneer (#305) `John Kirkham`_ +- Require Distributed 2.15.0+ (#306) `John Kirkham`_ +- Rely on Dask's ability to serialize collections (#307) `John Kirkham`_ +- Ensure CI installs GPU build of UCX (#308) `Peter Andreas Entschev`_ +- Skip 2nd serialization pass of `DeviceSerialized` (#309) `John Kirkham`_ +- Fix tests related to latest RMM changes (#310) `Peter Andreas Entschev`_ +- Fix dask-cuda-worker's interface argument (#314) `Peter Andreas Entschev`_ +- Check only for memory type during test_get_device_total_memory (#315) `Peter Andreas Entschev`_ +- Fix and improve DGX tests (#316) `Peter Andreas Entschev`_ +- Install dependencies via meta package (#317) `Ray Douglass`_ +- Fix errors when TLS files are not specified (#320) `Peter Andreas Entschev`_ +- Refactor dask-cuda-worker into CUDAWorker class (#324) `Jacob Tomlinson`_ +- Add missing __init__.py to dask_cuda/cli (#327) `Peter Andreas Entschev`_ +- Add Dask distributed GPU tests to CI (#329) `Benjamin Zaitlen`_ +- Fix rmm_pool_size argument name in docstrings (#329) `Benjamin Zaitlen`_ +- Add CPU support to benchmarks (#338) `Benjamin Zaitlen`_ +- Fix isort configuration (#339) `Mads R. B. Kristensen`_ +- Explicit-comms: cleanup and bug fix (#340) `Mads R. B. 
Kristensen`_ +- Add support for RMM managed memory (#343) `Peter Andreas Entschev`_ +- Update docker image in local build script (#345) `Sean Frye`_ +- Support pickle protocol 5 based spilling (#349) `John Kirkham`_ +- Use get_n_gpus for RMM test with dask-cuda-worker (#356) `Peter Andreas Entschev`_ +- Update RMM tests based on deprecated CNMeM (#359) `John Kirkham`_ +- Fix a black error in explicit comms (#360) `John Kirkham`_ +- Fix an `isort` error (#360) `John Kirkham`_ +- Fix an `isort` error (#360) `John Kirkham`_ +- Set `RMM_NO_INITIALIZE` environment variable (#363) `Benjamin Zaitlen`_ +- Fix bash lines in docs (#369) `Benjamin Zaitlen`_ +- Replace `RMM_NO_INITIALIZE` with `RAPIDS_NO_INITIALIZE` (#371) `John Kirkham`_ +- Fixes for docs and RTD updates (#373) `Benjamin Zaitlen`_ +- Confirm DGX tests are running baremetal (#376) `Peter Andreas Entschev`_ + 0.14 ---- - Publish branch-0.14 to conda (#262) `Paul Taylor`_ @@ -143,3 +177,5 @@ .. _`Paul Taylor`: https://github.com/trxcllnt .. _`Eli Fajardo`: https://github.com/efajardo-nv .. _`Randy Gelhausen`: https://github.com/randerzander +.. _`Jacob Tomlinson`: https://github.com/jacobtomlinson +.. _`Sean Frye`: https://github.com/sean-frye From 4d49f8cf6cf928d9442ab290f8c0efba9e6a2eff Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 13:26:37 -0700 Subject: [PATCH 116/126] Set RAPIDS_NO_INITIALIZE at the top of CUDAWorker --- dask_cuda/cuda_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index f9a82f68..dcb97698 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -72,6 +72,8 @@ def __init__( net_devices=None, **kwargs, ): + os.environ["RAPIDS_NO_INITIALIZE"] = "True" + enable_proctitle_on_current() enable_proctitle_on_children() @@ -133,7 +135,6 @@ def del_pid_file(): if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( From ee3c37d7c8dae0b03520e244c2827e63d894e4e3 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 13:40:16 -0700 Subject: [PATCH 117/126] Remove redundant RAPIDS_NO_INITIALIZE entries --- dask_cuda/benchmarks/utils.py | 2 -- dask_cuda/initialize.py | 2 -- dask_cuda/utils.py | 1 - 3 files changed, 5 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 5aeaa9b0..5d094fb4 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -186,8 +186,6 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import cupy - - os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm rmm.reinitialize( diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index 6fc42548..d65fa2db 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -47,7 +47,6 @@ def initialize( ): if create_cuda_context: try: - os.environ["RAPIDS_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) @@ -107,7 +106,6 @@ def dask_setup( ): if create_cuda_context: try: - os.environ["RAPIDS_NO_INITIALIZE"] = "True" numba.cuda.current_context() except Exception: logger.error("Unable to start CUDA Context", exc_info=True) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 63771238..72c7736c 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -35,7 +35,6 @@ def __init__(self, 
nbytes, managed_memory): def setup(self, worker=None): if self.nbytes is not None or self.managed_memory is True: - os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm pool_allocator = False if self.nbytes is None else True From cdc63f58bc9c06e45c3803a2101c262482d930e1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 13:44:03 -0700 Subject: [PATCH 118/126] Set RAPIDS_NO_INITIALIZE at the top of LocalCUDACluster --- dask_cuda/local_cuda_cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 2c771c6a..b9df001f 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -150,6 +150,8 @@ def __init__( rmm_managed_memory=False, **kwargs, ): + os.environ["RAPIDS_NO_INITIALIZE"] = "True" + if CUDA_VISIBLE_DEVICES is None: CUDA_VISIBLE_DEVICES = cuda_visible_devices(0) if isinstance(CUDA_VISIBLE_DEVICES, str): @@ -166,7 +168,6 @@ def __init__( self.rmm_managed_memory = rmm_managed_memory if rmm_pool_size is not None or rmm_managed_memory: try: - os.environ["RAPIDS_NO_INITIALIZE"] = "True" import rmm # noqa F401 except ImportError: raise ValueError( From 15abd717ea6f71cd4dde829db9754c6b0a25ae7e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 13:45:00 -0700 Subject: [PATCH 119/126] Add comment about need to set RAPIDS_NO_INITIALIZE --- dask_cuda/cuda_worker.py | 2 ++ dask_cuda/local_cuda_cluster.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index dcb97698..28d7c0d3 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -72,6 +72,8 @@ def __init__( net_devices=None, **kwargs, ): + # Required by RAPIDS libraries (e.g., cuDF) to ensure no context + # initialization happens before we can set CUDA_VISIBLE_DEVICES os.environ["RAPIDS_NO_INITIALIZE"] = "True" enable_proctitle_on_current() diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index b9df001f..6598b52e 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -150,6 +150,8 @@ def __init__( rmm_managed_memory=False, **kwargs, ): + # Required by RAPIDS libraries (e.g., cuDF) to ensure no context + # initialization happens before we can set CUDA_VISIBLE_DEVICES os.environ["RAPIDS_NO_INITIALIZE"] = "True" if CUDA_VISIBLE_DEVICES is None: From 7004e0b084cfa8d7fd7b8b1a0c41d392425cd994 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 13:49:05 -0700 Subject: [PATCH 120/126] Fix formatting errors --- dask_cuda/benchmarks/utils.py | 2 +- dask_cuda/initialize.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 5d094fb4..32d92158 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -1,5 +1,4 @@ import argparse -import os from dask.distributed import SSHCluster @@ -186,6 +185,7 @@ def get_scheduler_workers(dask_scheduler=None): def setup_memory_pool(pool_size=None, disable_pool=False): import cupy + import rmm rmm.reinitialize( diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index d65fa2db..e53c10de 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -24,7 +24,6 @@ about Dask configuration. 
""" import logging -import os import click import numba.cuda From 50672a1536818f1009768c84d7b831fbf6e1fe55 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 18 Aug 2020 14:15:31 -0700 Subject: [PATCH 121/126] Change pytest's basetemp in CI build script --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0c92c6e2..b4931161 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -100,7 +100,7 @@ else logger "Python py.test for dask-cuda..." cd $WORKSPACE ls dask_cuda/tests/ - UCXPY_IFNAME=eth0 UCX_WARN_UNUSED_ENV_VARS=n UCX_MEMTYPE_CACHE=n py.test -vs --cache-clear --junitxml=${WORKSPACE}/junit-dask-cuda.xml --cov-config=.coveragerc --cov=dask_cuda --cov-report=xml:${WORKSPACE}/dask-cuda-coverage.xml --cov-report term dask_cuda/tests/ + UCXPY_IFNAME=eth0 UCX_WARN_UNUSED_ENV_VARS=n UCX_MEMTYPE_CACHE=n py.test -vs --cache-clear --basetemp=${WORKSPACE}/dask-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cuda.xml --cov-config=.coveragerc --cov=dask_cuda --cov-report=xml:${WORKSPACE}/dask-cuda-coverage.xml --cov-report term dask_cuda/tests/ logger "Running dask.distributed GPU tests" # Test downstream packages, which requires Python v3.7 From 1c594a61c840c7c8c3a42692048f8065cad9a181 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 19 Aug 2020 07:54:49 -0700 Subject: [PATCH 122/126] revert warning test --- dask_cuda/tests/test_dask_cuda_worker.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 025c06cf..ad65e470 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -12,7 +12,7 @@ from dask_cuda.utils import get_gpu_count, wait_workers -def test_cuda_visible_devices_and_memory_limit_and_warning(loop): # noqa: F811 +def test_cuda_visible_devices_and_memory_limit(loop): # noqa: F811 os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,7,8" try: with popen(["dask-scheduler", "--port", "9359", "--no-dashboard"]): @@ -25,11 +25,8 @@ def test_cuda_visible_devices_and_memory_limit_and_warning(loop): # noqa: F811 "--device-memory-limit", "1 MB", "--no-dashboard", - "--enable-nvlink", - ], - stdout=True, - stderr=True, - ) as proc: + ] + ): with Client("127.0.0.1:9359", loop=loop) as client: assert wait_workers(client, n_gpus=4) @@ -46,15 +43,6 @@ def get_visible_devices(): for w in workers.values(): assert w["memory_limit"] == MEMORY_LIMIT // len(workers) - # grab first 5 lines of dask-cuda-worker startup - lines = [] - for idx, line in enumerate(proc.stderr): - lines.append(line) - if idx == 5: - break - - assert any(b"When using NVLink we" in line for line in lines) - assert len(expected) == 0 finally: del os.environ["CUDA_VISIBLE_DEVICES"] From 85d2b746d992fdf113b13b84218184825f408d81 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Wed, 19 Aug 2020 12:07:47 -0700 Subject: [PATCH 123/126] update numba pinning --- conda/recipes/dask-cuda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 3404113c..ece107eb 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -28,7 +28,7 @@ requirements: - distributed >=2.18.0 - pynvml >=8.0.3 - numpy >=1.16.0 - - numba >=0.40.1 + - numba >=0.50.0,!=0.51.0 test: imports: From e551fff1d69cd37d5e7055cb25737ab9386bdeca Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Thu, 
20 Aug 2020 06:40:59 -0700 Subject: [PATCH 124/126] update warning message --- dask_cuda/local_cuda_cluster.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index b1fe0fa4..f0703d35 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -183,8 +183,7 @@ def __init__( if enable_nvlink: warnings.warn( "When using NVLink we recommend setting a " - "`rmm_pool_size` or setting an RMM pool via `client.run`. " - "Please see: " + "`rmm_pool_size`. Please see: " "https://dask-cuda.readthedocs.io/en/latest/ucx.html" "#important-notes for more details" ) From 4470a37877963ff91ecf44765e446f597e07b29e Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 20 Aug 2020 08:04:33 -0700 Subject: [PATCH 125/126] More updates to 0.15 changelog --- CHANGELOG.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 40c57a9d..ea092916 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -31,6 +31,9 @@ - Replace `RMM_NO_INITIALIZE` with `RAPIDS_NO_INITIALIZE` (#371) `John Kirkham`_ - Fixes for docs and RTD updates (#373) `Benjamin Zaitlen`_ - Confirm DGX tests are running baremetal (#376) `Peter Andreas Entschev`_ +- Set RAPIDS_NO_INITIALIZE at the top of CUDAWorker/LocalCUDACluster (#379) `Peter Andreas Entschev`_ +- Change pytest's basetemp in CI build script (#380) `Peter Andreas Entschev`_ +- Pin Numba version to exclude 0.51.0 (#385) `Benjamin Zaitlen`_ 0.14 ---- From a7587ea16b94fb784d1ee7e257535a2a1464399e Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Tue, 25 Aug 2020 11:16:33 -0400 Subject: [PATCH 126/126] FIX Upload anaconda script --- ci/cpu/upload-anaconda.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/cpu/upload-anaconda.sh b/ci/cpu/upload-anaconda.sh index 4183188d..cf6c34f9 100755 --- a/ci/cpu/upload-anaconda.sh +++ b/ci/cpu/upload-anaconda.sh @@ -7,13 +7,12 @@ set -e export UPLOADFILE=`conda build conda/recipes/dask-cuda --python=$PYTHON --output` CUDA_REL=${CUDA_VERSION%.*} -SOURCE_BRANCH=master LABEL_OPTION="--label main" echo "LABEL_OPTION=${LABEL_OPTION}" # Restrict uploads to master branch -if [ ${GIT_BRANCH} != ${SOURCE_BRANCH} ]; then +if [ ${BUILD_MODE} != "branch" ]; then echo "Skipping upload" return 0 fi
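
Several of the patches above (111 and 116–119) converge on one pattern: export ``RAPIDS_NO_INITIALIZE`` before any RAPIDS import so that no CUDA context is created before ``CUDA_VISIBLE_DEVICES`` has been set for a worker. A rough user-level sketch of that ordering is shown below — the device index and pool size are placeholders chosen for this example, not values taken from the patches:

.. code-block:: python

    import os

    # Must be set before importing rmm/cuDF so that no CUDA context is
    # created until the desired device has been selected.
    os.environ["RAPIDS_NO_INITIALIZE"] = "True"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # placeholder device selection

    import rmm

    # Create an explicit RMM pool afterwards (recommended when NVLink is
    # enabled, per the warning updated in patch 124); 1 GiB is only an example.
    rmm.reinitialize(pool_allocator=True, initial_pool_size=2**30)

This mirrors what ``CUDAWorker`` and ``LocalCUDACluster`` now do internally at the top of their constructors.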