[train] set auto_transfer cuda device (#26819)
This sets the CUDA stream on the correct device (rather than the default device) when calling train.torch.prepare_data_loader(auto_transfer=True).
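For context, a minimal sketch (not part of this commit) of why the device argument matters: torch.cuda.Stream() is created on the current CUDA device, which is cuda:0 unless torch.cuda.set_device has been called, so a training worker assigned cuda:1 would otherwise get its copy stream on the wrong GPU. The cuda:1 device below is illustrative and assumes a machine with at least two GPUs.

import torch

device = torch.device("cuda:1")  # device assigned to this hypothetical worker

default_stream = torch.cuda.Stream()         # lands on the current device, cuda:0 by default
explicit_stream = torch.cuda.Stream(device)  # created on cuda:1, matching the fix below

print(default_stream.device)   # cuda:0
print(explicit_stream.device)  # cuda:1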

Signed-off-by: Matthew Deng <[email protected]>
matthewdeng authored Jul 21, 2022
1 parent 4da78c4 commit 728e2b3
Showing 2 changed files with 30 additions and 3 deletions.
27 changes: 27 additions & 0 deletions python/ray/train/tests/test_gpu.py
@@ -477,6 +477,33 @@ def host_to_device_auto_pipeline(device):
assert compute_average_runtime(host_to_device) >= with_auto_transfer


def test_auto_transfer_correct_device(ray_start_4_cpus_2_gpus):
"""Tests that auto_transfer uses the right device for the cuda stream."""
import nvidia_smi

nvidia_smi.nvmlInit()

def get_gpu_used_mem(i):
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
return info.used

start_gpu_memory = get_gpu_used_mem(1)

device = torch.device("cuda:1")
small_dataloader = [(torch.randn((1024 * 4, 1024 * 4)),) for _ in range(10)]
wrapped_dataloader = ( # noqa: F841
ray.train.torch.train_loop_utils._WrappedDataLoader(
small_dataloader, device, True
)
)

end_gpu_memory = get_gpu_used_mem(1)

# Verify GPU memory usage increases on the right cuda device
assert end_gpu_memory > start_gpu_memory


if __name__ == "__main__":
import sys

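An aside on the memory check above (not part of the commit; the reasoning is an assumption): the test goes through NVML rather than PyTorch's allocator statistics, presumably because torch.cuda.memory_allocated only counts tensors managed by PyTorch's caching allocator, whereas the CUDA context and stream created on cuda:1 only show up in the device-wide figure NVML reports. A small sketch of the two views, using pynvml (the same NVML bindings the nvidia_smi package exposes):

import torch
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(1)
nvml_used = pynvml.nvmlDeviceGetMemoryInfo(handle).used           # everything in use on GPU 1
torch_used = torch.cuda.memory_allocated(torch.device("cuda:1"))  # only PyTorch tensor allocations
print(nvml_used, torch_used)
pynvml.nvmlShutdown()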
6 changes: 3 additions & 3 deletions python/ray/train/torch/train_loop_utils.py
@@ -354,7 +354,7 @@ def prepare_data_loader(
data_loader: torch.utils.data.DataLoader,
add_dist_sampler: bool = True,
move_to_device: bool = True,
- auto_transfer: bool = True,
+ auto_transfer: bool = False,
) -> torch.utils.data.DataLoader:
"""Prepares DataLoader for distributed execution.
@@ -368,7 +368,7 @@ def prepare_data_loader(
the provided DataLoader.
move_to_device: If set, automatically move the data
returned by the data loader to the correct device.
- auto_transfer: If set and device is GPU, another CUDA stream
+ auto_transfer: (Experimental) If set and device is GPU, another CUDA stream
is created to automatically copy data from host (CPU) memory
to device (GPU) memory (the default CUDA stream still runs the
training procedure). If device is CPU, it will be disabled
@@ -567,7 +567,7 @@ def __init__(
self._auto_transfer = auto_transfer if device.type == "cuda" else False
# create a new CUDA stream to move data from host to device concurrently
self._memcpy_stream = (
- torch.cuda.Stream()
+ torch.cuda.Stream(device)
if device.type == "cuda" and self._auto_transfer
else None
)
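For reference, a hedged sketch of how auto_transfer is typically used from a Ray Train training function. The prepare_data_loader and prepare_model calls are the public API touched by this change; the dataset, model, and optimizer are illustrative placeholders, not code from this repository.

import torch
from ray.train.torch import prepare_data_loader, prepare_model

def train_func():
    dataset = torch.utils.data.TensorDataset(torch.randn(128, 8), torch.randn(128, 1))
    loader = torch.utils.data.DataLoader(dataset, batch_size=16)

    # With auto_transfer=True and a GPU device, batches are copied host-to-device
    # on a side CUDA stream while the default stream runs the training step.
    loader = prepare_data_loader(loader, auto_transfer=True)
    model = prepare_model(torch.nn.Linear(8, 1))

    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    for X, y in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(X), y)
        loss.backward()
        optimizer.step()

Run under a GPU-enabled Ray Train trainer, each worker's batches are moved to its assigned device; on CPU, auto_transfer is disabled as the docstring above notes.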
