
Commit fc4d718

Merge branch 'NVIDIA:main' into misc_2.4

2 parents 0d8fb83 + 2761205


54 files changed: +1035, -938 lines

3rdparty/cudnn-frontend

Submodule cudnn-frontend updated 69 files

build_tools/build_ext.py

Lines changed: 12 additions & 6 deletions

@@ -130,18 +130,24 @@ def run(self) -> None:
         super().run()
         self.extensions = all_extensions

-        # Ensure that binaries are not in global package space.
+        # Ensure that shared objects files for source and PyPI installations live
+        # in separate directories to avoid conflicts during install and runtime.
         lib_dir = (
             "wheel_lib"
             if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or framework_extension_only
             else ""
         )
-        target_dir = install_dir / "transformer_engine" / lib_dir
-        target_dir.mkdir(exist_ok=True, parents=True)

-        for ext in Path(self.build_lib).glob("*.so"):
-            self.copy_file(ext, target_dir)
-            os.remove(ext)
+        # Ensure that binaries are not in global package space.
+        # For editable/inplace builds this is not a concern as
+        # the SOs will be in a local directory anyway.
+        if not self.inplace:
+            target_dir = install_dir / "transformer_engine" / lib_dir
+            target_dir.mkdir(exist_ok=True, parents=True)
+
+            for ext in Path(self.build_lib).glob("*.so"):
+                self.copy_file(ext, target_dir)
+                os.remove(ext)

     def build_extensions(self):
         # For core lib + JAX install, fix build_ext from pybind11.setup_helpers
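The lib_dir selection above is self-contained enough to check in isolation. A minimal standalone sketch of that branch; select_lib_dir is a hypothetical wrapper around the expression taken verbatim from the diff:

import os

def select_lib_dir(framework_extension_only: bool) -> str:
    # Release wheels and framework-only builds keep their .so files in a
    # dedicated wheel_lib directory; source installs use the package root.
    return (
        "wheel_lib"
        if bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))) or framework_extension_only
        else ""
    )

os.environ["NVTE_RELEASE_BUILD"] = "0"
assert select_lib_dir(framework_extension_only=False) == ""
os.environ["NVTE_RELEASE_BUILD"] = "1"
assert select_lib_dir(framework_extension_only=False) == "wheel_lib"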

build_tools/jax.py

Lines changed: 7 additions & 2 deletions

@@ -9,7 +9,7 @@

 import setuptools

-from .utils import get_cuda_include_dirs, all_files_in_dir
+from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled
 from typing import List


@@ -41,7 +41,7 @@ def setup_jax_extension(
     # Source files
     csrc_source_files = Path(csrc_source_files)
     extensions_dir = csrc_source_files / "extensions"
-    sources = all_files_in_dir(extensions_dir, ".cpp")
+    sources = all_files_in_dir(extensions_dir, name_extension="cpp")

     # Header files
     include_dirs = get_cuda_include_dirs()

@@ -57,6 +57,11 @@ def setup_jax_extension(

     # Compile flags
     cxx_flags = ["-O3"]
+    if debug_build_enabled():
+        cxx_flags.append("-g")
+        cxx_flags.append("-UNDEBUG")
+    else:
+        cxx_flags.append("-g0")

     # Define TE/JAX as a Pybind11Extension
     from pybind11.setup_helpers import Pybind11Extension
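debug_build_enabled is imported from build_tools/utils.py, but its body is outside this diff. A minimal sketch of what such a helper could look like, assuming it keys off an environment variable; the NVTE_BUILD_DEBUG name is an assumption, not taken from this commit:

import os

def debug_build_enabled() -> bool:
    # Assumed env-var name; the real helper lives in build_tools/utils.py.
    return bool(int(os.getenv("NVTE_BUILD_DEBUG", "0")))

With a helper like this, debug builds compile with -g and keep assertions enabled (-UNDEBUG), while regular builds strip debug info with -g0.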

build_tools/pytorch.py

Lines changed: 8 additions & 10 deletions

@@ -8,7 +8,7 @@

 import setuptools

-from .utils import all_files_in_dir, cuda_version, get_cuda_include_dirs
+from .utils import all_files_in_dir, cuda_version, get_cuda_include_dirs, debug_build_enabled


 def setup_pytorch_extension(

@@ -19,11 +19,7 @@ def setup_pytorch_extension(
     """Setup CUDA extension for PyTorch support"""

     # Source files
-    csrc_source_files = Path(csrc_source_files)
-    extensions_dir = csrc_source_files / "extensions"
-    sources = [
-        csrc_source_files / "common.cpp",
-    ] + all_files_in_dir(extensions_dir)
+    sources = all_files_in_dir(Path(csrc_source_files), name_extension="cpp")

     # Header files
     include_dirs = get_cuda_include_dirs()

@@ -37,10 +33,12 @@ def setup_pytorch_extension(
     )

     # Compiler flags
-    cxx_flags = [
-        "-O3",
-        "-fvisibility=hidden",
-    ]
+    cxx_flags = ["-O3", "-fvisibility=hidden"]
+    if debug_build_enabled():
+        cxx_flags.append("-g")
+        cxx_flags.append("-UNDEBUG")
+    else:
+        cxx_flags.append("-g0")

     # Version-dependent CUDA options
     try:

build_tools/utils.py

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ def all_files_in_dir(path, name_extension=None):
     all_files = []
     for dirname, _, names in os.walk(path):
         for name in names:
-            if name_extension is not None and name_extension not in name:
+            if name_extension is not None and not name.endswith(f".{name_extension}"):
                 continue
             all_files.append(Path(dirname, name))
     return all_files
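Two things change at once here: the match becomes a strict suffix test, and callers now pass the extension without the leading dot (name_extension="cpp"), since the helper prepends it. A quick illustration with hypothetical file names:

def old_match(name, name_extension):
    return name_extension in name  # substring test (before)

def new_match(name, name_extension):
    return name.endswith(f".{name_extension}")  # strict suffix test (after)

for name in ["common.cpp", "common.cpp.orig", "kernels.cu"]:
    print(name, old_match(name, ".cpp"), new_match(name, "cpp"))
# common.cpp      True  True
# common.cpp.orig True  False   <- no longer picked up as a source file
# kernels.cu      False False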

examples/jax/encoder/test_multiprocessing_encoder.py

Lines changed: 1 addition & 1 deletion

@@ -609,7 +609,7 @@ def test_te_bf16(self):
     def test_te_delayed_scaling_fp8(self):
         """Test Transformer Engine with DelayedScaling FP8"""
         result = self.exec(True, "DelayedScaling")
-        assert result[0] < 0.505 and result[1] > 0.754
+        assert result[0] < 0.505 and result[1] > 0.753

     @unittest.skipIf(
         not is_fp8_supported(), "Device compute capability 9.0+ is required for CurrentScaling FP8"

qa/L0_pytorch_unittest/test.sh

Lines changed: 2 additions & 2 deletions

@@ -42,8 +42,8 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
 NVTE_FLASH_ATTN=0 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cpu_offloading.xml $TE_PATH/tests/pytorch/test_cpu_offloading.py || test_fail "test_cpu_offloading.py"
-NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python3 -m pytest -o log_cli=true --log-cli-level=INFO -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || test_fail "test_fused_attn.py"
-NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 python3 -m pytest -o log_cli=true --log-cli-level=INFO -v -s --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/fused_attn/test_kv_cache.py || test_fail "test_kv_cache.py"
+python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py || test_fail "test_fused_attn.py"
+python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_kv_cache.xml $TE_PATH/tests/pytorch/fused_attn/test_kv_cache.py || test_fail "test_kv_cache.py"

 if [ "$RET" -ne 0 ]; then
     echo "Error in the following test cases:$FAILED_CASES"

qa/L3_pytorch_FA_versions_test/test.sh

Lines changed: 5 additions & 3 deletions

@@ -11,15 +11,17 @@ mkdir -p "$XML_LOG_DIR"
 pip3 install pytest==8.2.1

 # Limit parallel build jobs to avoid overwhelming system resources
-export MAX_JOBS=4
+export MAX_JOBS=32

 # Iterate over Flash Attention versions
 sm_arch=`python3 -c "import torch; sm = torch.cuda.get_device_capability(0); print(sm[0]*10+sm[1])"`
+export FLASH_ATTN_CUDA_ARCHS=$sm_arch
 if [ $sm_arch -gt 90 ]
 then
     FA_versions=(2.7.3)
-else
-    FA_versions=(2.3.0 2.4.1 2.5.7 2.7.3 3.0.0b1)
+elif [ $sm_arch -eq 90 ]
+then
+    FA_versions=(2.5.7 2.7.3 3.0.0b1)
 fi

 for fa_version in "${FA_versions[@]}"

setup.py

Lines changed: 1 addition & 3 deletions

@@ -123,7 +123,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
         )
         # Blackwell is not supported as of Triton 3.2.0, need custom internal build
         # install_reqs.append("triton")
-        test_reqs.extend(["numpy", "torchvision", "prettytable", "PyYAML"])
+        test_reqs.extend(["numpy", "torchvision"])
     if "jax" in frameworks:
         setup_reqs.extend(["jax[cuda12]", "flax>=0.7.1"])
         install_reqs.extend(["jax", "flax>=0.7.1"])

@@ -144,7 +144,6 @@
         int(os.getenv("NVTE_RELEASE_BUILD", "0"))
     ), "NVTE_RELEASE_BUILD env must be set for metapackage build."
     ext_modules = []
-    cmdclass = {}
     package_data = {}
     include_package_data = False
     setup_requires = []

@@ -156,7 +155,6 @@
 else:
     setup_requires, install_requires, test_requires = setup_requirements()
     ext_modules = [setup_common_extension()]
-    cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist}
     package_data = {"": ["VERSION.txt"]}
     include_package_data = True
     extras_require = {"test": test_requires}

tests/jax/utils.py

Lines changed: 36 additions & 32 deletions

@@ -13,7 +13,6 @@
 import jax.numpy as jnp
 import numpy as np
 from flax import linen as nn
-from flax.linen import partitioning as nn_partitioning
 from flax.linen.attention import combine_masks
 from jax import lax, vmap
 from jax import nn as jax_nn
@@ -316,16 +315,22 @@ def __call__(self, inputs: Array) -> Array:

         kernel_shape = tuple(inputs.shape[ax] for ax in axis) + features
         kernel_param_shape = (np.prod([inputs.shape[ax] for ax in axis]), np.prod(features))
-        kernel = nn_partitioning.param_with_axes(
-            "kernel", self.kernel_init, kernel_param_shape, self.dtype, axes=self.kernel_axes
+        kernel = self.param(
+            "kernel",
+            nn.with_logical_partitioning(self.kernel_init, self.kernel_axes),
+            kernel_param_shape,
+            self.dtype,
         )

         kernel = jnp.asarray(kernel, input_dtype)
         kernel = jnp.reshape(kernel, kernel_shape)

         if self.use_bias:
-            bias = nn_partitioning.param_with_axes(
-                "bias", self.bias_init, self.features, self.dtype, axes=self.bias_axes
+            bias = self.param(
+                "bias",
+                nn.with_logical_partitioning(self.bias_init, self.bias_axes),
+                self.features,
+                self.dtype,
             )
             bias = bias.astype(input_dtype)
         else:
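The pattern repeated throughout this file replaces the deprecated nn_partitioning.param_with_axes helper with plain self.param plus nn.with_logical_partitioning, which boxes the parameter together with its logical axis names. A minimal self-contained sketch of the new pattern; TinyDense is a hypothetical module, and the exact boxing behavior assumes a recent Flax:

import jax
import jax.numpy as jnp
from flax import linen as nn

class TinyDense(nn.Module):
    features: int = 4

    @nn.compact
    def __call__(self, x):
        # with_logical_partitioning wraps the initializer so the stored
        # parameter carries its logical axis names as sharding metadata.
        kernel = self.param(
            "kernel",
            nn.with_logical_partitioning(nn.initializers.lecun_normal(), ("embed", "mlp")),
            (x.shape[-1], self.features),
            jnp.float32,
        )
        return x @ kernel

params = TinyDense().init(jax.random.PRNGKey(0), jnp.ones((2, 3)))["params"]
print(type(params["kernel"]).__name__)  # Partitioned -- axis names travel with the param
print(params["kernel"].names)           # ('embed', 'mlp')

Inside the module, self.param still returns the plain array (Flax unboxes the metadata), which is why the surrounding jnp.asarray and reshape calls in this diff are unchanged.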
@@ -422,9 +427,9 @@ def __call__(self, inputs, deterministic: bool = False):
         ) # Broadcast along length.

         if self.transpose_batch_sequence:
-            x = nn_partitioning.with_sharding_constraint(x, ("length", "batch", "mlp"))
+            x = nn.with_logical_constraint(x, ("length", "batch", "mlp"))
         else:
-            x = nn_partitioning.with_sharding_constraint(x, ("batch", "length", "mlp"))
+            x = nn.with_logical_constraint(x, ("batch", "length", "mlp"))
         output = DenseGeneral(
             inputs.shape[-1],
             dtype=self.dtype,

@@ -688,21 +693,13 @@ def qkv_init(key, shape, dtype):
         value = value.reshape((*value.shape[:2], self.num_gqa_groups, self.head_dim))

         if self.transpose_batch_sequence:
-            query = nn_partitioning.with_sharding_constraint(
-                query, ("length", "batch", "heads", "kv")
-            )
-            key = nn_partitioning.with_sharding_constraint(key, ("length", "batch", "heads", "kv"))
-            value = nn_partitioning.with_sharding_constraint(
-                value, ("length", "batch", "heads", "kv")
-            )
+            query = nn.with_logical_constraint(query, ("length", "batch", "heads", "kv"))
+            key = nn.with_logical_constraint(key, ("length", "batch", "heads", "kv"))
+            value = nn.with_logical_constraint(value, ("length", "batch", "heads", "kv"))
         else:
-            query = nn_partitioning.with_sharding_constraint(
-                query, ("batch", "length", "heads", "kv")
-            )
-            key = nn_partitioning.with_sharding_constraint(key, ("batch", "length", "heads", "kv"))
-            value = nn_partitioning.with_sharding_constraint(
-                value, ("batch", "length", "heads", "kv")
-            )
+            query = nn.with_logical_constraint(query, ("batch", "length", "heads", "kv"))
+            key = nn.with_logical_constraint(key, ("batch", "length", "heads", "kv"))
+            value = nn.with_logical_constraint(value, ("batch", "length", "heads", "kv"))

         if decode:
             # Detect if we're initializing by absence of existing cache data.

@@ -809,9 +806,9 @@ def qkv_init(key, shape, dtype):
         x = x.reshape((x.shape[0], x.shape[1], x.shape[2] * x.shape[3]))

         if self.transpose_batch_sequence:
-            x = nn_partitioning.with_sharding_constraint(x, ("length", "batch", "joined_kv"))
+            x = nn.with_logical_constraint(x, ("length", "batch", "joined_kv"))
         else:
-            x = nn_partitioning.with_sharding_constraint(x, ("batch", "length", "joined_kv"))
+            x = nn.with_logical_constraint(x, ("batch", "length", "joined_kv"))

         # Back to the original inputs dimensions.

@@ -857,17 +854,23 @@ def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
         input_dtype = x.dtype
         features = x.shape[-1]

-        scale = nn_partitioning.param_with_axes(
-            "scale", self.scale_init, (features,), self.dtype, axes=("embed",)
+        scale = self.param(
+            "scale",
+            nn.with_logical_partitioning(self.scale_init, ("embed",)),
+            (features,),
+            self.dtype,
         )
         x_ = x.astype(jnp.float32)
         if self.layernorm_type == "layernorm":
             mean = jnp.mean(x_, axis=-1, keepdims=True)
             var = jnp.mean(jnp.square(x_ - mean), axis=-1, keepdims=True)
             y = (x_ - mean) * lax.rsqrt(var + self.epsilon)

-            bias = nn_partitioning.param_with_axes(
-                "ln_bias", self.bias_init, (features,), self.dtype, axes=("embed",)
+            bias = self.param(
+                "ln_bias",
+                nn.with_logical_partitioning(self.bias_init, ("embed",)),
+                (features,),
+                self.dtype,
             )
             bias = jnp.asarray(bias, input_dtype)

@@ -976,12 +979,11 @@ def __call__(self, qlen, klen, bidirectional=True):
             num_buckets=self.num_buckets,
             max_distance=self.max_distance,
         )
-        relative_attention_bias = nn_partitioning.param_with_axes(
+        relative_attention_bias = self.param(
             "rel_embedding",
-            self.embedding_init,
+            nn.with_logical_partitioning(self.embedding_init, ("heads", "relpos_buckets")),
             (self.num_heads, self.num_buckets),
             jnp.float32,
-            axes=("heads", "relpos_buckets"),
         )

         relative_attention_bias = jnp.asarray(relative_attention_bias, self.dtype)
@@ -1559,14 +1561,16 @@ def sync_params_values(dst, src, transformations, sep="/"):
     """
     src_values = {}
     for key, value in jax.tree_util.tree_leaves_with_path(src):
-        normalized_key = sep.join(x.key for x in key)
+        # Only select DictKey(key="...") entries, skip GetAttr(name="...") entries at the end of the tree path
+        normalized_key = sep.join(x.key for x in key if hasattr(x, "key"))
         src_values[normalized_key] = value

     flatten_dst, dst_tree_def = jax.tree_util.tree_flatten_with_path(dst)
     synced_dst_values = []

     for key, value in flatten_dst:
-        normalized_key = sep.join(x.key for x in key)
+        # Only select DictKey(key="...") entries, skip GetAttr(name="...") entries at the end of the tree path
+        normalized_key = sep.join(x.key for x in key if hasattr(x, "key"))
         if normalized_key in transformations:
             corresponding_src_key = transformations[normalized_key]
         else:
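The filter matters because the migration to nn.with_logical_partitioning stores parameters in nn.Partitioned boxes, whose leaf is exposed through a .value attribute; the resulting trailing path entry is a GetAttrKey, which carries a .name attribute rather than .key. A small sketch, assuming a recent Flax/JAX pairing:

import jax
import jax.numpy as jnp
from flax import linen as nn

# A Partitioned box is itself a pytree node; its leaf sits behind .value.
tree = {"dense": {"kernel": nn.Partitioned(jnp.ones((2, 2)), names=("embed", "mlp"))}}

for path, leaf in jax.tree_util.tree_leaves_with_path(tree):
    print(path)
    # (DictKey(key='dense'), DictKey(key='kernel'), GetAttrKey(name='value'))
    print("/".join(p.key for p in path if hasattr(p, "key")))
    # dense/kernel  <- the trailing GetAttrKey is skipped, keeping keys stable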

tests/pytorch/test_numerics.py

Lines changed: 7 additions & 1 deletion

@@ -42,7 +42,7 @@
 from transformer_engine.pytorch.cpp_extensions import general_gemm, general_grouped_gemm
 from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer
 from transformer_engine.pytorch.module.base import get_multi_stream_cublas_workspace, get_workspace
-from transformer_engine.pytorch.utils import get_device_compute_capability
+from transformer_engine.pytorch.utils import get_device_compute_capability, get_cudnn_version
 from transformer_engine.common import recipe
 import transformer_engine_torch as tex

@@ -2293,6 +2293,12 @@ def test_kv_cache_accuracy(dtype, bs, model_key, use_RoPE, input_format, module,
         pytest.skip("FusedAttention and FlashAttention do not support FP32")
     if use_RoPE:
         pytest.skip("KV cache does not support starting positions for RoPE")
+    if (
+        backend == "FusedAttention"
+        and get_device_compute_capability() == (8, 9)
+        and get_cudnn_version() < (9, 11, 0)
+    ):
+        pytest.skip("Skip KV cache for sm89 and cuDNN < 9.11")

     os.environ["NVTE_FLASH_ATTN"] = "0"
     os.environ["NVTE_FUSED_ATTN"] = "0"
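The new skip relies on both helpers returning version tuples, so the cuDNN gate is an ordinary lexicographic tuple comparison:

# Lexicographic tuple comparison is what makes the version gate work:
assert (9, 10, 2) < (9, 11, 0)        # older cuDNN -> test is skipped
assert not ((9, 11, 0) < (9, 11, 0))  # cuDNN 9.11.0 itself runs the test
assert (8, 9) == (8, 9)               # sm89 detected via exact tuple match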

transformer_engine/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -11,12 +11,12 @@

 try:
     from . import pytorch
-except (ImportError, StopIteration) as e:
+except ImportError as e:
     pass

 try:
     from . import jax
-except (ImportError, StopIteration) as e:
+except ImportError as e:
     pass

 __version__ = str(metadata.version("transformer_engine"))
