Enable autograd graph to propagate after multi-device syncing for loss functions in ddp #2754

Merged
merged 35 commits into from
Oct 31, 2024
Changes from 11 commits
35 commits
8122e9f
propagate rank result to gathered result for autograd compatibility
cw-tan Sep 17, 2024
c2b6d19
add unittest for dpp gather autograd compatibility
cw-tan Sep 17, 2024
7dec9b4
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 9, 2024
d1e64e4
changelog
SkafteNicki Oct 9, 2024
fc366b8
add to docs
SkafteNicki Oct 9, 2024
59c9ced
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 9, 2024
dab2bd9
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 9, 2024
6f188a8
Apply suggestions from code review
SkafteNicki Oct 9, 2024
ebb4f4c
add missing import
SkafteNicki Oct 9, 2024
05b6e96
remove redundant functions
SkafteNicki Oct 9, 2024
86aceb6
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 10, 2024
f854bf2
try no_grad for the all gather
cw-tan Oct 10, 2024
25ffff2
retry with all tested torch versions
cw-tan Oct 11, 2024
e82c70e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2024
b5f285d
incorporate trials
cw-tan Oct 11, 2024
4e1e836
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 12, 2024
5b9f79d
Merge branch 'master' into all_gather_ad
Borda Oct 14, 2024
5164e1d
Merge branch 'master' into all_gather_ad
Borda Oct 14, 2024
91cff5e
lint
Borda Oct 14, 2024
8fdc912
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 15, 2024
4c13d6c
try adding contiguous
cw-tan Oct 15, 2024
74bf6b2
Merge branch 'master' into all_gather_ad
Borda Oct 16, 2024
00935f1
Merge branch 'master' into all_gather_ad
cw-tan Oct 18, 2024
150251c
try using float64
cw-tan Oct 18, 2024
70967ba
Merge branch 'master' into all_gather_ad
cw-tan Oct 18, 2024
9b17d6f
try using random numbers
cw-tan Oct 19, 2024
6e476ea
Merge branch 'master' into all_gather_ad
Borda Oct 21, 2024
c20f07c
Merge branch 'master' into all_gather_ad
Borda Oct 22, 2024
2033395
Merge branch 'master' into all_gather_ad
Borda Oct 23, 2024
a424412
Merge branch 'master' into all_gather_ad
Borda Oct 30, 2024
8b263ae
fix changelog
SkafteNicki Oct 31, 2024
8d2c27e
small changes to distributed
SkafteNicki Oct 31, 2024
48e699b
tests
SkafteNicki Oct 31, 2024
ea37534
Merge branch 'master' into all_gather_ad
SkafteNicki Oct 31, 2024
5f29c4d
caution
Borda Oct 31, 2024
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -24,6 +24,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added multi-output support for MAE metric ([#2605](https://github.com/Lightning-AI/torchmetrics/pull/2605))


- Added support for propagation of the autograd graph in ddp setting ([#2754](https://github.com/Lightning-AI/torchmetrics/pull/2754))


### Changed

- Tracker higher is better integration ([#2649](https://github.com/Lightning-AI/torchmetrics/pull/2649))
4 changes: 4 additions & 0 deletions docs/source/pages/overview.rst
@@ -492,6 +492,10 @@ In practice this means that:

A functional metric is differentiable if its corresponding modular metric is differentiable.

For PyTorch versions 2.1 or higher, differentiation in DDP mode is enabled, allowing autograd graph
propagation after the ``all_gather`` operation. This is useful for synchronizing metrics used as
loss functions in a DDP setting.
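
As an illustration of this use case, here is a minimal sketch of a cross-rank loss built on top of a functional metric. The helper name ``synced_mse_loss`` and the choice of ``mean_squared_error`` are only an example of the pattern, not part of this PR's diff; the sketch assumes an initialised process group and per-rank ``preds``/``target`` tensors.

```python
import torch

from torchmetrics.functional import mean_squared_error
from torchmetrics.utilities.distributed import gather_all_tensors


def synced_mse_loss(preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Average the per-rank MSE across all ranks while staying differentiable."""
    local_value = mean_squared_error(preds, target)  # differentiable scalar on this rank
    gathered = gather_all_tensors(local_value)  # list with one entry per rank
    # On torch>=2.1 the local entry keeps its autograd graph, so the mean can be
    # back-propagated: gradients flow through this rank's contribution.
    return torch.stack(gathered).mean()
```

Calling ``loss = synced_mse_loss(preds, target)`` followed by ``loss.backward()`` then yields gradients for the local rank's parameters.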

***************************************
Metrics and hyperparameter optimization
***************************************
8 changes: 8 additions & 0 deletions src/torchmetrics/utilities/distributed.py
@@ -18,6 +18,8 @@
from torch.nn import functional as F # noqa: N812
from typing_extensions import Literal

from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_2_1


def reduce(x: Tensor, reduction: Literal["elementwise_mean", "sum", "none", None]) -> Tensor:
"""Reduces a given tensor by a given reduction method.
@@ -91,6 +93,9 @@ def class_reduce(
def _simple_gather_all_tensors(result: Tensor, group: Any, world_size: int) -> List[Tensor]:
gathered_result = [torch.zeros_like(result) for _ in range(world_size)]
torch.distributed.all_gather(gathered_result, result, group)
# to propagate the autograd graph from the local rank (effective for torch>=2.1)
if _TORCH_GREATER_EQUAL_2_1:
gathered_result[torch.distributed.get_rank(group)] = result
return gathered_result


@@ -144,4 +149,7 @@ def gather_all_tensors(result: Tensor, group: Optional[Any] = None) -> List[Tens
for idx, item_size in enumerate(local_sizes):
slice_param = [slice(dim_size) for dim_size in item_size]
gathered_result[idx] = gathered_result[idx][slice_param]
# to propagate the autograd graph from the local rank (effective for torch>=2.1)
if _TORCH_GREATER_EQUAL_2_1:
gathered_result[torch.distributed.get_rank(group)] = result
return gathered_result
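
The effect of the re-insertion lines above can be seen without a process group: the buffers filled by ``all_gather`` carry no autograd history, so only the re-inserted local tensor lets gradients reach the original input. A single-process sketch, in which detached clones stand in for the gathered buffers:

```python
import torch

x = torch.ones(3, requires_grad=True)
# Stand-ins for the buffers all_gather fills: same values, but no autograd history.
gathered = [x.detach().clone(), x.detach().clone()]
gathered[0] = x  # what the torch>=2.1 branch does for the local rank's slot
loss = torch.stack([t.sum() for t in gathered]).sum()
loss.backward()
print(x.grad)  # tensor([1., 1., 1.]) -- gradients flow through the local slot only
```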
70 changes: 70 additions & 0 deletions tests/unittests/bases/test_ddp.py
@@ -105,6 +105,76 @@ def test_ddp(process):
pytest.pool.map(process, range(NUM_PROCESSES))


def _test_ddp_gather_autograd_same_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None:
"""Test that ddp gather preserves local rank's autograd graph for same-shaped tensors across ranks.

This function tests that ``torchmetrics.utilities.distributed.gather_all_tensors`` works as intended in
preserving the local rank's autograd graph upon the gather. The function compares derivative values obtained
with the local rank results from the ``gather_all_tensors`` output and the original local rank tensor.
This test only considers tensors of the same shape across different ranks.

Note that this test only works for torch>=2.1.

"""
tensor = torch.ones(50, requires_grad=True)
result = gather_all_tensors(tensor)
assert len(result) == worldsize
scalar1 = 0
scalar2 = 0
for idx in range(worldsize):
if idx == rank:
scalar1 = scalar1 + torch.sum(tensor * torch.ones_like(tensor))
else:
scalar1 = scalar1 + torch.sum(result[idx] * torch.ones_like(result[idx]))
scalar2 = scalar2 + torch.sum(result[idx] * torch.ones_like(result[idx]))
gradient1 = torch.autograd.grad(scalar1, [tensor], retain_graph=True)[0]
gradient2 = torch.autograd.grad(scalar2, [tensor])[0]
assert torch.allclose(gradient1, gradient2)


def _test_ddp_gather_autograd_different_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None:
"""Test that ddp gather preserves local rank's autograd graph for differently-shaped tensors across ranks.

This function tests that ``torchmetrics.utilities.distributed.gather_all_tensors`` works as intended in
preserving the local rank's autograd graph upon the gather. The function compares derivative values obtained
with the local rank results from the ``gather_all_tensors`` output and the original local rank tensor.
This test considers tensors of different shapes across different ranks.

Note that this test only works for torch>=2.1.

"""
tensor = torch.ones(rank + 1, 2 - rank, requires_grad=True)
result = gather_all_tensors(tensor)
assert len(result) == worldsize
scalar1 = 0
scalar2 = 0
for idx in range(worldsize):
if idx == rank:
scalar1 = scalar1 + torch.sum(tensor * torch.ones_like(tensor))
else:
scalar1 = scalar1 + torch.sum(result[idx] * torch.ones_like(result[idx]))
scalar2 = scalar2 + torch.sum(result[idx] * torch.ones_like(result[idx]))
gradient1 = torch.autograd.grad(scalar1, [tensor], retain_graph=True)[0]
gradient2 = torch.autograd.grad(scalar2, [tensor])[0]
assert torch.allclose(gradient1, gradient2)


@pytest.mark.DDP()
@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows")
@pytest.mark.skipif(not USE_PYTEST_POOL, reason="DDP pool is not available.")
@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_2_1, reason="test only works on newer torch versions")
@pytest.mark.parametrize(
"process",
[
_test_ddp_gather_autograd_same_shape,
_test_ddp_gather_autograd_different_shape,
],
)
def test_ddp_autograd(process):
"""Test ddp functions for autograd compatibility."""
pytest.pool.map(process, range(NUM_PROCESSES))
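
For reference, a standalone two-process sketch of what the pooled tests above exercise, assuming a ``gloo`` backend on CPU and that port 29500 is free (the address, port, and worker name are illustrative, not part of the test suite):

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from torchmetrics.utilities.distributed import gather_all_tensors


def _worker(rank: int, world_size: int) -> None:
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"  # assumed to be free
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    tensor = torch.ones(5, requires_grad=True)
    gathered = gather_all_tensors(tensor)
    loss = torch.stack([t.sum() for t in gathered]).sum()
    loss.backward()  # with torch>=2.1 the local entry keeps its autograd graph
    print(f"rank {rank} grad: {tensor.grad}")  # ones(5) on every rank
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_worker, args=(2,), nprocs=2)
```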


def _test_non_contiguous_tensors(rank):
class DummyCatMetric(Metric):
full_state_update = True