[Feature] Auto scaling factor tuning for FP8 collective communication #140

Open · wants to merge 15 commits into base: main
fp8e4m3 for wgrad
wkcn committed Dec 7, 2023

commit 69f73ce35b6a1d48e441643d2cefab2388b54423
13 changes: 6 additions & 7 deletions msamp/megatron/optimizer/distrib_optimizer.py
@@ -547,16 +547,15 @@ def reduce_model_grads(self, args, timers): # noqa: C901
             for p in model_group:
                 g = p.main_grad
                 if g is not None and not torch.is_tensor(g):
-                    if g.qtype != WGRAD_QTYPE:
-                        raise TypeError('g.qtype != WGRAD_QTYPE: {} != {}'.format(g.qtype, WGRAD_QTYPE))
+                    if g.qtype != Dtypes.kfloat8_e4m3:
+                        raise TypeError('g.qtype != Dtypes.kfloat8_e4m3: {}'.format(g.qtype))
                     # stat overflow ratio
                     num_infs = torch.count_nonzero((g.value & 0x7f) == 126)
                     overflow_ratio = num_infs / g.numel()
-                    if args.wgrad_auto_scaling_ratio is not None:
-                        if overflow_ratio > args.wgrad_auto_scaling_ratio:
-                            g.meta.pre_scale /= 2.0
-                        else:
-                            g.meta.pre_scale *= 2.0**(1.0 / args.wgrad_auto_scaling_window)
+                    if overflow_ratio > args.wgrad_auto_scaling_ratio:
+                        g.meta.pre_scale /= 2.0
+                    else:
+                        g.meta.pre_scale *= 2.0**(1.0 / args.wgrad_auto_scaling_window)

         # synchonize pre_scale
         for model_id, model in enumerate(self.models):
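
For readers skimming the hunk above: this commit (a) requires the weight gradient to already be stored as FP8 e4m3 (Dtypes.kfloat8_e4m3), and (b) drops the `args.wgrad_auto_scaling_ratio is not None` guard so the pre-scale is tuned from the observed overflow ratio on every gradient reduction. What follows is a minimal, self-contained sketch of that tuning rule, not MS-AMP's actual API; the helper name tune_pre_scale and its bare arguments are hypothetical, while in the real code the same arithmetic runs on g.value (the raw e4m3 bytes) and g.meta.pre_scale inside reduce_model_grads.

import torch


def tune_pre_scale(fp8_bytes: torch.Tensor,
                   pre_scale: torch.Tensor,
                   auto_scaling_ratio: float,
                   auto_scaling_window: int) -> torch.Tensor:
    """Sketch of the wgrad auto-scaling rule shown in the diff above.

    fp8_bytes: raw uint8 storage of an FP8 e4m3 tensor (what the diff calls g.value).
    pre_scale: scaling factor applied before quantization (g.meta.pre_scale).
    """
    # In e4m3 the bit pattern S.1111.110 (0x7E == 126) is the largest finite
    # magnitude (±448), so elements saturated at that value count as overflows.
    num_infs = torch.count_nonzero((fp8_bytes & 0x7F) == 126)
    overflow_ratio = num_infs.float() / fp8_bytes.numel()
    if overflow_ratio > auto_scaling_ratio:
        # Too many saturated values: back off aggressively.
        pre_scale = pre_scale / 2.0
    else:
        # Otherwise recover slowly, doubling once per `auto_scaling_window` clean steps.
        pre_scale = pre_scale * 2.0 ** (1.0 / auto_scaling_window)
    return pre_scale

The asymmetric schedule (halve immediately on overflow, grow back by 2**(1/window)) mirrors common dynamic loss-scaling heuristics: it reacts quickly to saturation but needs a full window of clean reductions to regain the lost dynamic range.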