[train] set auto_transfer cuda device (#26819)
This sets the CUDA stream on the correct device (rather than the default device) when calling train.torch.prepare_data_loader(auto_transfer=True).
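For context, a minimal sketch (not part of this commit) of why the device argument matters: torch.cuda.Stream() is created on the current CUDA device, which is cuda:0 unless torch.cuda.set_device has been called, so a training worker assigned cuda:1 would otherwise get its copy stream on the wrong GPU. The cuda:1 device below is illustrative and assumes a machine with at least two GPUs.

import torch

device = torch.device("cuda:1")  # device assigned to this hypothetical worker

default_stream = torch.cuda.Stream()         # lands on the current device, cuda:0 by default
explicit_stream = torch.cuda.Stream(device)  # created on cuda:1, matching the fix below

print(default_stream.device)   # cuda:0
print(explicit_stream.device)  # cuda:1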

Signed-off-by: Matthew Deng <[email protected]>
matthewdeng authored Jul 21, 2022
1 parent 4da78c4 commit 728e2b3
Showing 2 changed files with 30 additions and 3 deletions.
27 changes: 27 additions & 0 deletions python/ray/train/tests/test_gpu.py
@@ -477,6 +477,33 @@ def host_to_device_auto_pipeline(device):
assert compute_average_runtime(host_to_device) >= with_auto_transfer


def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
"""Tests that auto_transfer uses the right device for the cuda stream."""
import nvidia_smi

nvidia_smi.nvmlInit()

def get_gpu_used_mem(i):
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
return info.used

start_gpu_memory = get_gpu_used_mem(1)

device = torch.device("cuda:1")
small_dataloader = [(torch.randn((1024 * 4, 1024 * 4)),) for _ in range(10)]
wrapped_dataloader = ( # noqa: F841
ray.train.torch.train_loop_utils._WrappedDataLoader(
small_dataloader, device, True
)
)

end_gpu_memory = get_gpu_used_mem(1)

# Verify GPU memory usage increases on the right cuda device
assert end_gpu_memory > start_gpu_memory


if __name__ == "__main__":
import sys

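An aside on the memory check above (not part of the commit; the reasoning is an assumption): the test goes through NVML rather than PyTorch's allocator statistics, presumably because torch.cuda.memory_allocated only counts tensors managed by PyTorch's caching allocator, whereas the CUDA context and stream created on cuda:1 only show up in the device-wide figure NVML reports. A small sketch of the two views, using pynvml (the same NVML bindings the nvidia_smi package exposes):

import torch
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(1)
nvml_used = pynvml.nvmlDeviceGetMemoryInfo(handle).used           # everything in use on GPU 1
torch_used = torch.cuda.memory_allocated(torch.device("cuda:1"))  # only PyTorch tensor allocations
print(nvml_used, torch_used)
pynvml.nvmlShutdown()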
6 changes: 3 additions & 3 deletions python/ray/train/torch/train_loop_utils.py
@@ -354,7 +354,7 @@ def prepare_data_loader(
data_loader: torch.utils.data.DataLoader,
add_dist_sampler: bool = True,
move_to_device: bool = True,
- auto_transfer: bool = True,
+ auto_transfer: bool = False,
) -> torch.utils.data.DataLoader:
"""Prepares DataLoader for distributed execution.
@@ -368,7 +368,7 @@ def prepare_data_loader(
the provided DataLoader.
move_to_device: If set, automatically move the data
returned by the data loader to the correct device.
- auto_transfer: If set and device is GPU, another CUDA stream
+ auto_transfer: (Experimental) If set and device is GPU, another CUDA stream
is created to automatically copy data from host (CPU) memory
to device (GPU) memory (the default CUDA stream still runs the
training procedure). If device is CPU, it will be disabled
@@ -567,7 +567,7 @@ def __init__(
self._auto_transfer = auto_transfer if device.type == "cuda" else False
# create a new CUDA stream to move data from host to device concurrently
self._memcpy_stream = (
- torch.cuda.Stream()
+ torch.cuda.Stream(device)
if device.type == "cuda" and self._auto_transfer
else None
)
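For reference, a hedged sketch of how auto_transfer is typically used from a Ray Train training function. The prepare_data_loader and prepare_model calls are the public API touched by this change; the dataset, model, and optimizer are illustrative placeholders, not code from this repository.

import torch
from ray.train.torch import prepare_data_loader, prepare_model

def train_func():
    dataset = torch.utils.data.TensorDataset(torch.randn(128, 8), torch.randn(128, 1))
    loader = torch.utils.data.DataLoader(dataset, batch_size=16)

    # With auto_transfer=True and a GPU device, batches are copied host-to-device
    # on a side CUDA stream while the default stream runs the training step.
    loader = prepare_data_loader(loader, auto_transfer=True)
    model = prepare_model(torch.nn.Linear(8, 1))

    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    for X, y in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(X), y)
        loss.backward()
        optimizer.step()

Run under a GPU-enabled Ray Train trainer, each worker's batches are moved to its assigned device; on CPU, auto_transfer is disabled as the docstring above notes.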
