Commit fe2e0a0

document precompute_float8_amax_for_fsdp

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
weifengpy committed Jul 9, 2024
1 parent 1cbaa13 commit fe2e0a0
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions float8_experimental/fsdp_utils.py
@@ -11,6 +11,14 @@


 def precompute_float8_amax_for_fsdp(module: nn.Module) -> None:
+    """
+    Calculate amax for all float8 parameters after the optimizer step.
+    It performs a single all-reduce instead of one all-reduce per parameter.
+    Example usage:
+        model(input).sum().backward()
+        optim.step()
+        precompute_float8_amax_for_fsdp(model)
+    """
     from torch.distributed._tensor import DTensor

     if any(
@@ -32,10 +40,12 @@ def precompute_float8_amax_for_fsdp(module: nn.Module) -> None:

     def compute_amaxes(weights: List[DTensor]):
         # inf-norm is equivalent to max(abs(w))
-        max_weights = torch._foreach_norm(weights, ord=math.inf)
-        amax_tensor = torch.vstack(max_weights)
-        amax_tensor = torch.clamp(amax_tensor, EPS) # R
-        amaxes = torch.split(amax_tensor, 1) # R
+        max_weights = torch._foreach_norm(weights, ord=math.inf) # Partial
+        amax_tensor = torch.vstack(max_weights) # Partial
+        # clamp is dispatched through DTensor
+        # it will issue a single all-reduce
+        amax_tensor = torch.clamp(amax_tensor, EPS) # Replicate
+        amaxes = torch.split(amax_tensor, 1) # Replicate
         return amaxes

     if weights:
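
For context, here is a minimal single-process sketch of the amax math in compute_amaxes, using plain tensors instead of DTensors (so no all-reduce is issued). The EPS value and the weight shapes below are placeholders for illustration, not the values used in fsdp_utils.py:

    import math

    import torch

    EPS = 1e-12  # placeholder; stands in for the EPS constant in fsdp_utils.py

    # Stand-ins for the float8 weights; under FSDP2 these would be DTensor shards.
    weights = [torch.randn(16, 16), torch.randn(32, 16)]

    # inf-norm over all elements is equivalent to max(abs(w)) for each weight
    max_weights = torch._foreach_norm(weights, ord=math.inf)
    assert all(torch.equal(n, w.abs().max()) for n, w in zip(max_weights, weights))

    # Stack the per-weight amaxes so a distributed reduction would happen once,
    # then clamp away zeros and split back into one amax per weight.
    amax_tensor = torch.clamp(torch.vstack(max_weights), EPS)
    amaxes = torch.split(amax_tensor, 1)
    print([amax.item() for amax in amaxes])

With DTensor weights, max_weights and amax_tensor start out with Partial placement, and the clamp is the op that issues the single all-reduce producing Replicate placement, as the new comments in the diff note.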
