Commit 4ceb3d4
[JAX] Distributed Current Scaling (#1699)
* Update test_helper.py and add QuantizeConfig class for CurrentScaling
* WIP distributed current scaling
* Distributed Current Scaling (debugging). The distributed implementation with a replicated scale_inv works for layernorm_mlp but feels like a hack: each device holds a different scale_inv value, yet jax.debug.print only shows one of them. Because we tell JAX/XLA that this scale is replicated, it assumes all the values are equal but never checks it, so per-device scales happen to work for current scaling. This is fragile, though; it may break if we or the user change the partitioning, or if XLA ever acts on the assumption that all the scale_invs are identical.
* Implement distributed current scaling by computing a global amax and scale before quantization (see the sketch below)
* Add encoder and mnist tests for current scaling
* Add primitive prefix to shardy unique_vars to prevent factor conflicts when performing unfused primitives for current scaling
* Remove scale_shape primitive arg that is no longer used
* Format
* Fix expected result on multiprocessing encoder test
* Lint fix
* Update multiprocessing current scaling tolerances
* Uncomment test case that was disabled for testing
* Remove commented-out debug line

Signed-off-by: Jeremy Berchtold <[email protected]>
1 parent 643fb0a commit 4ceb3d4

16 files changed (+230 / -202 lines)
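The approach of the final implementation — every device computes its local amax, the amaxes are all-reduced so the whole tensor shares one scale, and only then is the tensor quantized — can be sketched in plain JAX as follows. This is a hedged illustration rather than Transformer Engine's actual kernel: the mesh axis name "dp", the E4M3_MAX constant, and the quantize_current_scaling helper are hypothetical names introduced only for this example.

# Hypothetical sketch of distributed current scaling: agree on one global amax
# via an all-reduce, derive one scale from it, then quantize the local shards.
from functools import partial

import jax
import jax.numpy as jnp
from jax.experimental.shard_map import shard_map
from jax.sharding import Mesh, PartitionSpec as P

E4M3_MAX = 448.0  # largest finite magnitude of float8_e4m3fn (assumed here)

mesh = Mesh(jax.devices(), axis_names=("dp",))

@partial(shard_map, mesh=mesh, in_specs=P("dp"), out_specs=(P("dp"), P()))
def quantize_current_scaling(x):
    local_amax = jnp.max(jnp.abs(x))                        # amax of this shard only
    global_amax = jax.lax.pmax(local_amax, axis_name="dp")  # one value for all devices
    scale = E4M3_MAX / jnp.maximum(global_amax, 1e-12)
    x_fp8 = (x * scale).astype(jnp.float8_e4m3fn)
    return x_fp8, 1.0 / scale  # sharded FP8 data plus a replicated scale_inv

x = jnp.linspace(-2.0, 2.0, 1024 * jax.device_count())
x_fp8, scale_inv = quantize_current_scaling(x)

The single FP32 all-reduce of the amax is also what the updated collectives count in tests/jax/test_distributed_layernorm.py below accounts for: 4 extra bytes of all-reduce traffic per quantization when data parallelism is enabled.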

examples/jax/encoder/common.py

Lines changed: 2 additions & 0 deletions
@@ -37,5 +37,7 @@ def get_fp8_recipe_from_name_string(name: str):
             return recipe.DelayedScaling()
         case "MXFP8BlockScaling":
             return recipe.MXFP8BlockScaling()
+        case "Float8CurrentScaling":
+            return recipe.Float8CurrentScaling()
         case _:
             raise ValueError(f"Invalid fp8_recipe, got {name}")
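For context, the encoder and MNIST examples exercise the new branch roughly as below. This is a hedged usage sketch: the import path for the helper and the surrounding argument plumbing are assumptions; only get_fp8_recipe_from_name_string and fp8_autocast come from this diff and the TE JAX API.

# Hypothetical usage: map the recipe name from the CLI to a recipe object and
# enter fp8_autocast with it.
from common import get_fp8_recipe_from_name_string  # examples/jax/encoder/common.py
import transformer_engine.jax as te

fp8_recipe = get_fp8_recipe_from_name_string("Float8CurrentScaling")
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    ...  # build and train the model in FP8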

examples/jax/encoder/run_test_multiprocessing_encoder.sh

Lines changed: 2 additions & 0 deletions
@@ -8,9 +8,11 @@ NUM_GPUS=${NUM_GPUS:-$(nvidia-smi -L | wc -l)}
 TEST_CASES=(
     "test_te_bf16"
     "test_te_delayed_scaling_fp8"
+    "test_te_current_scaling_fp8"
     "test_te_mxfp8"
     "test_te_bf16_shardy"
     "test_te_delayed_scaling_fp8_shardy"
+    "test_te_current_scaling_fp8_shardy"
 )

 echo

examples/jax/encoder/test_multigpu_encoder.py

Lines changed: 17 additions & 0 deletions
@@ -441,6 +441,14 @@ def test_te_delayed_scaling_fp8(self):
         actual = train_and_evaluate(self.args)
         assert actual[0] < 0.535 and actual[1] > 0.73

+    @unittest.skipIf(not is_fp8_supported, fp8_reason)
+    def test_te_current_scaling_fp8(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        self.args.use_fp8 = True
+        self.args.fp8_recipe = "Float8CurrentScaling"
+        actual = train_and_evaluate(self.args)
+        assert actual[0] < 0.535 and actual[1] > 0.73
+
     @unittest.skipIf(not is_mxfp8_supported, mxfp8_reason)
     def test_te_mxfp8(self):
         """Test Transformer Engine with MXFP8"""
@@ -467,6 +475,15 @@ def test_te_delayed_scaling_fp8_shardy(self):

     # TODO(jreiffers): Add mxfp8 Shardy tests once supported in JAX.

+    @unittest.skipIf(not is_fp8_supported, fp8_reason)
+    def test_te_current_scaling_fp8_shardy(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        self.args.enable_shardy = True
+        self.args.use_fp8 = True
+        self.args.fp8_recipe = "Float8CurrentScaling"
+        actual = train_and_evaluate(self.args)
+        assert actual[0] < 0.535 and actual[1] > 0.73
+

 if __name__ == "__main__":
     train_and_evaluate(encoder_parser(None))

examples/jax/encoder/test_multiprocessing_encoder.py

Lines changed: 17 additions & 1 deletion
@@ -611,6 +611,14 @@ def test_te_delayed_scaling_fp8(self):
         result = self.exec(True, "DelayedScaling")
         assert result[0] < 0.505 and result[1] > 0.754

+    @unittest.skipIf(
+        not is_fp8_supported(), "Device compute capability 9.0+ is required for CurrentScaling FP8"
+    )
+    def test_te_current_scaling_fp8(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        result = self.exec(True, "Float8CurrentScaling")
+        assert result[0] < 0.507 and result[1] > 0.753
+
     @unittest.skipIf(
         not is_mxfp8_supported(), "Device compute capability 10.0+ is required for MXFP8"
     )
@@ -631,10 +639,18 @@ def test_te_bf16_shardy(self):
     def test_te_delayed_scaling_fp8_shardy(self):
         """Test Transformer Engine with DelayedScaling FP8"""
         result = self.exec(True, "DelayedScaling", enable_shardy=True)
-        assert result[0] < 0.505 and result[1] > 0.754
+        assert result[0] < 0.505 and result[1] > 0.753

     # TODO(jreiffers): Add mxfp8 Shardy tests once supported in JAX.

+    @unittest.skipIf(
+        not is_fp8_supported(), "Device compute capability 9.0+ is required for CurrentScaling FP8"
+    )
+    def test_te_current_scaling_fp8_shardy(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        result = self.exec(True, "Float8CurrentScaling", enable_shardy=True)
+        assert result[0] < 0.507 and result[1] > 0.753
+

 if __name__ == "__main__":
     train_and_evaluate(encoder_parser(None))

examples/jax/encoder/test_single_gpu_encoder.py

Lines changed: 8 additions & 0 deletions
@@ -348,6 +348,14 @@ def test_te_delayed_scaling_fp8(self):
         actual = train_and_evaluate(self.args)
         assert actual[0] < 0.455 and actual[1] > 0.79

+    @unittest.skipIf(not is_fp8_supported, fp8_reason)
+    def test_te_current_scaling_fp8(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        self.args.use_fp8 = True
+        self.args.fp8_recipe = "Float8CurrentScaling"
+        actual = train_and_evaluate(self.args)
+        assert actual[0] < 0.455 and actual[1] > 0.79
+
     @unittest.skipIf(not is_mxfp8_supported, mxfp8_reason)
     def test_te_mxfp8(self):
         """Test Transformer Engine with MXFP8"""

examples/jax/mnist/test_single_gpu_mnist.py

Lines changed: 8 additions & 0 deletions
@@ -350,6 +350,14 @@ def test_te_mxfp8(self):
         actual = train_and_evaluate(self.args)
         self.verify(actual)

+    @unittest.skipIf(not is_fp8_supported, fp8_reason)
+    def test_te_current_scaling_fp8(self):
+        """Test Transformer Engine with CurrentScaling FP8"""
+        self.args.use_fp8 = True
+        self.args.fp8_recipe = "Float8CurrentScaling"
+        actual = train_and_evaluate(self.args)
+        self.verify(actual)
+

 if __name__ == "__main__":
     train_and_evaluate(mnist_parser(None))

tests/jax/test_distributed_layernorm.py

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,7 @@
 SUPPORTED_RECIPES = []
 if is_fp8_supported:
     SUPPORTED_RECIPES.append(pytest.param(recipe.DelayedScaling(), id="DelayedScaling"))
+    SUPPORTED_RECIPES.append(pytest.param(recipe.Float8CurrentScaling(), id="CurrentScaling"))
 if is_mxfp8_supported:
     SUPPORTED_RECIPES.append(pytest.param(recipe.MXFP8BlockScaling(), id="MXFP8BlockScaling"))

@@ -76,6 +77,8 @@ def generate_collectives_count_ref(
     other_bytes = 0
     if fp8_recipe == recipe.MXFP8BlockScaling() and "dp" in mesh_axes:
         other_bytes = 384  # required for small scale shapes that require padding
+    if fp8_recipe == recipe.Float8CurrentScaling():
+        allreduce_total_bytes += 4  # 1 * FP32 for the amax reduction
     return generate_collectives_count(
         allreduce=allreduce_total_bytes * int(is_dp_enabled), allgather=0, other=other_bytes
     )

tests/jax/test_distributed_layernorm_mlp.py

Lines changed: 1 addition & 27 deletions
@@ -41,6 +41,7 @@
 SUPPORTED_RECIPES = []
 if is_fp8_supported:
     SUPPORTED_RECIPES.append(pytest.param(recipe.DelayedScaling(), id="DelayedScaling"))
+    SUPPORTED_RECIPES.append(pytest.param(recipe.Float8CurrentScaling(), id="CurrentScaling"))
 if is_mxfp8_supported:
     SUPPORTED_RECIPES.append(pytest.param(recipe.MXFP8BlockScaling(), id="MXFP8BlockScaling"))

@@ -217,37 +218,10 @@ def _test_layernorm_mlp_grad(
                 m_grad, s_grad, dtype=dtype, err_msg=f"multi_grads[{i}] is not close"
             )
         else:
-            is_gated = len(activation_type) > 1
-            rtol = None
-            atol = None
-            if is_gated:
-                if dtype == jnp.bfloat16:
-                    if i == 2:
-                        rtol = 800
-                        atol = 9e-2
-                    if i == 4:
-                        atol = 300
-                        rtol = 1e-1
-                if dtype == jnp.float16:
-                    if i == 1:  # gamma
-                        rtol = 200
-                        atol = 1e-2
-                    if i == 2:
-                        rtol = 2000
-                        atol = 7e-2
-                if i == 4 and fp8_recipe == recipe.MXFP8BlockScaling():  # bias_1
-                    # Accumulating dbias across a large tensor introduces a larger difference
-                    rtol = 200
-                    atol = 4e-2
-                if i == 4 and fp8_recipe == recipe.DelayedScaling():
-                    rtol = 2200
-                    atol = 9e-2
             assert_allclose(
                 multi_grads[i],
                 single_grads[i],
                 dtype=dtype,
-                rtol=rtol,
-                atol=atol,
                 err_msg=f"multi_grads[{i}] is not close",
             )

tests/jax/test_helper.py

Lines changed: 72 additions & 37 deletions
@@ -10,47 +10,22 @@
 import numpy as np

 from utils import assert_allclose
-from transformer_engine.common.recipe import DelayedScaling
+from transformer_engine.common.recipe import DelayedScaling, MXFP8BlockScaling, Float8CurrentScaling
 from transformer_engine.common.recipe import Format as FP8Format
 from transformer_engine.jax import fp8_autocast, get_delayed_scaling
-from transformer_engine.jax.quantize import QuantizeConfig, is_fp8_available, AmaxComputeAlgo
+from transformer_engine.jax.quantize import (
+    QuantizeConfig,
+    is_fp8_available,
+    ScalingMode,
+    update_collections,
+)
 from transformer_engine.jax.sharding import MeshResource, global_mesh_resource

 is_fp8_supported, reason = is_fp8_available()
+is_mxfp8_supported, mxfp8_reason = is_fp8_available(ScalingMode.MXFP8_1D_SCALING)


-class TestQuantizeConfig(unittest.TestCase):
-
-    @unittest.skipIf(not is_fp8_supported, reason=reason)
-    def test_initialize(self):
-        margin = 5.0
-        fp8_format = FP8Format.E4M3
-        amax_history_len = 10
-
-        QuantizeConfig.initialize(
-            margin=margin, fp8_format=fp8_format, amax_history_len=amax_history_len
-        )
-
-        self.assertEqual(
-            QuantizeConfig.MARGIN,
-            margin,
-            f"QuantizeConfig.MARGIN initialization failed, should be {margin}"
-            f" but got {QuantizeConfig.MARGIN}.",
-        )
-        self.assertEqual(
-            QuantizeConfig.FP8_FORMAT,
-            fp8_format,
-            f"QuantizeConfig.FP8_FORMAT initialization failed, should be {fp8_format}"
-            f" but got {QuantizeConfig.FP8_FORMAT}.",
-        )
-        self.assertEqual(
-            QuantizeConfig.AMAX_HISTORY_LEN,
-            amax_history_len,
-            f"QuantizeConfig.AMAX_HISTORY_LEN initialization failed, should be {amax_history_len}"
-            f" but got {QuantizeConfig.AMAX_HISTORY_LEN}.",
-        )
-
-        QuantizeConfig.finalize()
+class TestHelper(unittest.TestCase):

     @unittest.skipIf(not is_fp8_supported, reason=reason)
     def test_update_collections(self):
@@ -61,12 +36,12 @@ def test_update_collections(self):
             "test1": original_val,
             "test2": original_val,
         }
-        updated_state = QuantizeConfig.update_collections({"test1": updated_val}, original_state)
+        updated_state = update_collections({"test1": updated_val}, original_state)
         self.assertEqual(updated_state["test1"], updated_val)
         self.assertEqual(updated_state["test2"], original_val)

         original_state = flax.core.frozen_dict.FrozenDict(original_state)
-        updated_state = QuantizeConfig.update_collections({"test1": updated_val}, original_state)
+        updated_state = update_collections({"test1": updated_val}, original_state)
         self.assertEqual(updated_state["test1"], updated_val)
         self.assertEqual(updated_state["test2"], original_val)

@@ -82,8 +57,18 @@ def _compare_delay_scaling(self, ref, test):
         self.assertTrue(ref.amax_history_len == test.amax_history_len)
         self.assertTrue(ref.amax_compute_algo == test.amax_compute_algo)

+    def _compare_current_scaling(self, test):
+        self.assertEqual(QuantizeConfig.MARGIN, test.margin)
+        self.assertEqual(QuantizeConfig.FP8_FORMAT, test.fp8_format)
+        self.assertEqual(QuantizeConfig.SCALING_MODE, ScalingMode.CURRENT_TENSOR_SCALING)
+
+    def _compare_mxfp8_scaling(self, test):
+        self.assertEqual(QuantizeConfig.MARGIN, test.margin)
+        self.assertEqual(QuantizeConfig.FP8_FORMAT, test.fp8_format)
+        self.assertEqual(QuantizeConfig.SCALING_MODE, ScalingMode.MXFP8_1D_SCALING)
+
     @unittest.skipIf(not is_fp8_supported, reason=reason)
-    def test_fp8_autocast(self):
+    def test_fp8_autocast_delayed_scaling(self):
         QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
         self._check_defult_state()

@@ -107,6 +92,56 @@ def test_fp8_autocast(self):

         self._check_defult_state()

+    @unittest.skipIf(not is_fp8_supported, reason=reason)
+    def test_fp8_autocast_current_scaling(self):
+        QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
+        self._check_defult_state()
+
+        with fp8_autocast(enabled=False, fp8_recipe=Float8CurrentScaling()):
+            self.assertFalse(QuantizeConfig.is_fp8_enabled())
+            self._compare_current_scaling(Float8CurrentScaling())
+
+        self._check_defult_state()
+
+        cs = Float8CurrentScaling(margin=5.0, fp8_format=FP8Format.E4M3)
+        with fp8_autocast(enabled=True, fp8_recipe=cs):
+            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self._compare_current_scaling(cs)
+
+        self._check_defult_state()
+
+        cs = Float8CurrentScaling(margin=3.0, fp8_format=FP8Format.HYBRID)
+        with fp8_autocast(enabled=True, fp8_recipe=cs):
+            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self._compare_current_scaling(cs)
+
+        self._check_defult_state()
+
+    @unittest.skipIf(not is_mxfp8_supported, reason=mxfp8_reason)
+    def test_fp8_autocast_mxfp8_scaling(self):
+        QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.
+        self._check_defult_state()
+
+        with fp8_autocast(enabled=False, fp8_recipe=MXFP8BlockScaling()):
+            self.assertFalse(QuantizeConfig.is_fp8_enabled())
+            self._compare_mxfp8_scaling(MXFP8BlockScaling())
+
+        self._check_defult_state()
+
+        bs = MXFP8BlockScaling(margin=5.0, fp8_format=FP8Format.E4M3)
+        with fp8_autocast(enabled=True, fp8_recipe=bs):
+            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self._compare_mxfp8_scaling(bs)
+
+        self._check_defult_state()
+
+        bs = MXFP8BlockScaling(margin=3.0, fp8_format=FP8Format.HYBRID)
+        with fp8_autocast(enabled=True, fp8_recipe=bs):
+            self.assertTrue(QuantizeConfig.is_fp8_enabled())
+            self._compare_mxfp8_scaling(bs)
+
+        self._check_defult_state()
+
     @unittest.skipIf(not is_fp8_supported, reason=reason)
     def test_fp8_autocast_with_sharding_resource(self):
         QuantizeConfig.finalize()  # Ensure the testing not affect by previous tests.