What happened + What you expected to happen
When using a Ray actor to run PyTorch tensor computations, we noticed that CUDA memory usage kept increasing with every call. After investigation, we found that the growth only appears when the actor is created with max_concurrency > 1; we expected memory to be released after each call, as it is with a regular actor. In the test below, the leak reaches about 3 GB after 100 calls.
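As a sanity check that is not part of the original report, the same kind of growth can be looked for without Ray by running the workload from a fresh Python thread on each iteration and logging the allocator state in between. The function names below are hypothetical and the snippet only assumes a single CUDA GPU.

import threading

import torch


def tensor_workload(num: int = 100) -> None:
    # Roughly mirrors the actor method: chain matmuls on the GPU, then free and empty the cache.
    tensor = torch.randn((1024, 1024), device='cuda')
    for _ in range(num):
        tensor = torch.relu(torch.mm(tensor, torch.randn((1024, 1024), device='cuda')))
    del tensor
    torch.cuda.empty_cache()


def run_in_fresh_threads(iterations: int = 100) -> None:
    # A new thread per iteration approximates how a threaded actor spreads calls across pool threads.
    for i in range(iterations):
        worker = threading.Thread(target=tensor_workload)
        worker.start()
        worker.join()
        allocated = torch.cuda.memory_allocated() / 1024 ** 3
        reserved = torch.cuda.memory_reserved() / 1024 ** 3
        print(f"iter {i}: allocated {allocated:.3f} GB, reserved {reserved:.3f} GB")


if __name__ == '__main__':
    run_in_fresh_threads()

If memory also accumulates here, the behavior is tied to multi-threaded CUDA use in general rather than to Ray specifically.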
Versions / Dependencies
Ray: 2.40.0
Python
Reproduction script
import json

import ray
import torch


def log_gpu_memory_usage(head: str):
    # Report the CUDA caching allocator state in GB.
    memory_allocated = torch.cuda.memory_allocated() / 1024 ** 3
    memory_reserved = torch.cuda.memory_reserved() / 1024 ** 3
    message = f'{head}, memory allocated (GB): {memory_allocated}, memory reserved (GB): {memory_reserved}'
    print(message)
    return memory_allocated, memory_reserved


@ray.remote(num_gpus=1)
class ComputeTensorActor:
    def __init__(self, name):
        self.name = name

    def compute_tensor(self, num=100):
        torch.manual_seed(0)
        tensor_size = (1024, 1024)
        tensor = torch.randn(tensor_size, device='cuda')
        tensor_list = [torch.randn(tensor_size, device='cuda') for _ in range(num * 100)]
        for _ in range(num):
            other_tensor = torch.randn(tensor_size, device='cuda')
            tensor = torch.mm(tensor, other_tensor)
            tensor = torch.relu(tensor)
            tensor += 0.1 * torch.randn(tensor_size, device='cuda')
        metrics = {}
        memory_allocated, memory_reserved = log_gpu_memory_usage(head=f"{self.name} before empty cache")
        metrics["onload/memory_allocated"] = memory_allocated
        metrics["onload/memory_reserved"] = memory_reserved
        # Everything allocated above should be reclaimable once released.
        del tensor_list, tensor
        torch.cuda.empty_cache()
        memory_allocated, memory_reserved = log_gpu_memory_usage(head=f"{self.name} after empty cache")
        metrics["offload/memory_allocated"] = memory_allocated
        metrics["offload/memory_reserved"] = memory_reserved
        return metrics


def test_thread_actor():
    ray.init(num_gpus=1, ignore_reinit_error=True)
    # Threaded actor: max_concurrency > 1 runs calls on a thread pool.
    cp_actor = ComputeTensorActor.options(num_gpus=1, max_concurrency=1000).remote("thread actor")
    num = 100
    metric_list = [ray.get(cp_actor.compute_tensor.remote(num=num)) for _ in range(num)]
    print(metric_list)
    with open("thread_actor_metrics.json", "w") as f:
        json.dump(metric_list, f)
    ray.shutdown()  # release the GPU so the next test can schedule its actor


def test_common_actor():
    ray.init(num_gpus=1, ignore_reinit_error=True)
    # Regular actor: calls run sequentially on a single thread.
    cp_actor = ComputeTensorActor.options(num_gpus=1).remote("common actor")
    num = 100
    metric_list = [ray.get(cp_actor.compute_tensor.remote(num=num)) for _ in range(num)]
    print(metric_list)
    with open("common_actor_metrics.json", "w") as f:
        json.dump(metric_list, f)
    ray.shutdown()


if __name__ == '__main__':
    test_thread_actor()
    test_common_actor()
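A possible mitigation, not verified against this issue: instead of a large default max_concurrency, pin the CUDA-touching method to a single-threaded concurrency group so its calls always execute on one thread, leaving other methods free to run concurrently. The actor name below is hypothetical and the method is a trimmed version of the one above.

import ray
import torch


@ray.remote(num_gpus=1, concurrency_groups={"cuda": 1})
class PinnedComputeActor:
    # Calls in the "cuda" group are serialized onto a single thread; methods
    # outside the group fall into the default group and can still overlap.
    @ray.method(concurrency_group="cuda")
    def compute_tensor(self, num: int = 100) -> float:
        tensor = torch.randn((1024, 1024), device='cuda')
        for _ in range(num):
            tensor = torch.relu(torch.mm(tensor, torch.randn((1024, 1024), device='cuda')))
        del tensor
        torch.cuda.empty_cache()
        # Allocated GB left behind after freeing; this should stay near zero call after call.
        return torch.cuda.memory_allocated() / 1024 ** 3

Whether this avoids the growth depends on the root cause; if it does not, dropping max_concurrency back to 1 (the plain-actor case above) is the fallback the report already demonstrates.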
Issue Severity
Medium: It is a significant difficulty but I can work around it.