Commit fe2e0a0

document precompute_float8_amax_for_fsdp

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
weifengpy committed Jul 9, 2024
1 parent 1cbaa13 commit fe2e0a0
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions float8_experimental/fsdp_utils.py
@@ -11,6 +11,14 @@


 def precompute_float8_amax_for_fsdp(module: nn.Module) -> None:
+    """
+    Calculate amax for all float8 parameters after the optimizer step.
+    It performs a single all-reduce instead of one all-reduce per parameter.
+    Example usage:
+        model(input).sum().backward()
+        optim.step()
+        precompute_float8_amax_for_fsdp(model)
+    """
     from torch.distributed._tensor import DTensor

     if any(
@@ -32,10 +40,12 @@ def precompute_float8_amax_for_fsdp(module: nn.Module) -> None:

     def compute_amaxes(weights: List[DTensor]):
         # inf-norm is equivalent to max(abs(w))
-        max_weights = torch._foreach_norm(weights, ord=math.inf)
-        amax_tensor = torch.vstack(max_weights)
-        amax_tensor = torch.clamp(amax_tensor, EPS) # R
-        amaxes = torch.split(amax_tensor, 1) # R
+        max_weights = torch._foreach_norm(weights, ord=math.inf) # Partial
+        amax_tensor = torch.vstack(max_weights) # Partial
+        # clamp is dispatched through DTensor
+        # it will issue a single all-reduce
+        amax_tensor = torch.clamp(amax_tensor, EPS) # Replicate
+        amaxes = torch.split(amax_tensor, 1) # Replicate
         return amaxes

     if weights:
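
For context, here is a minimal single-process sketch of the amax math in compute_amaxes, using plain tensors instead of DTensors (so no all-reduce is issued). The EPS value and the weight shapes below are placeholders for illustration, not the values used in fsdp_utils.py:

    import math

    import torch

    EPS = 1e-12  # placeholder; stands in for the EPS constant in fsdp_utils.py

    # Stand-ins for the float8 weights; under FSDP2 these would be DTensor shards.
    weights = [torch.randn(16, 16), torch.randn(32, 16)]

    # inf-norm over all elements is equivalent to max(abs(w)) for each weight
    max_weights = torch._foreach_norm(weights, ord=math.inf)
    assert all(torch.equal(n, w.abs().max()) for n, w in zip(max_weights, weights))

    # Stack the per-weight amaxes so a distributed reduction would happen once,
    # then clamp away zeros and split back into one amax per weight.
    amax_tensor = torch.clamp(torch.vstack(max_weights), EPS)
    amaxes = torch.split(amax_tensor, 1)
    print([amax.item() for amax in amaxes])

With DTensor weights, max_weights and amax_tensor start out with Partial placement, and the clamp is the op that issues the single all-reduce producing Replicate placement, as the new comments in the diff note.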
