[pull] main from NVIDIA:main #17

Merged 10 commits on Jul 8, 2024

Changes from all commits
build_tools/build_ext.py: 9 additions & 2 deletions

@@ -23,6 +23,7 @@
     found_ninja,
     get_frameworks,
     cuda_path,
+    get_max_jobs_for_parallel_build,
 )


@@ -60,8 +61,6 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
             f"-DCMAKE_INSTALL_PREFIX={install_dir}",
         ]
         configure_command += self.cmake_flags
-        if found_ninja():
-            configure_command.append("-GNinja")

         import pybind11

@@ -73,6 +72,14 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
         build_command = [_cmake_bin, "--build", build_dir]
         install_command = [_cmake_bin, "--install", build_dir]

+        # Check whether parallel build is restricted
+        max_jobs = get_max_jobs_for_parallel_build()
+        if found_ninja():
+            configure_command.append("-GNinja")
+        build_command.append("--parallel")
+        if max_jobs > 0:
+            build_command.append(str(max_jobs))
+
         # Run CMake commands
         for command in [configure_command, build_command, install_command]:
             print(f"Running command {' '.join(command)}")
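For context: cmake --build accepts a --parallel flag with an optional job count (CMake 3.12+). A minimal sketch of the command this logic assembles, assuming Ninja is found and get_max_jobs_for_parallel_build() returns 4 (the literal values stand in for _cmake_bin and build_dir):

    # Sketch only: mirrors the logic above with placeholder values.
    build_command = ["cmake", "--build", "build"]  # stand-ins for _cmake_bin and build_dir
    max_jobs = 4  # assumed return value of get_max_jobs_for_parallel_build()
    build_command.append("--parallel")
    if max_jobs > 0:
        build_command.append(str(max_jobs))
    print(" ".join(build_command))  # cmake --build build --parallel 4

With max_jobs == 0 the command ends with a bare --parallel, which lets CMake and the Ninja generator choose the job count.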
build_tools/pytorch.py: 3 additions & 4 deletions

@@ -74,10 +74,9 @@ def setup_pytorch_extension(
     if version >= (11, 8):
         nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])

-    # Libraries -- PyTorch CUDAExtension links to libcudart.so but not to libcuda.so
-    cuda_home, _ = cuda_path()
-    library_dirs = [cuda_home / "compat" / "lib"]
-    libraries = ["cuda"]
+    # Libraries
+    library_dirs = []
+    libraries = []
     if os.getenv("UB_MPI_BOOTSTRAP"):
         assert (
             os.getenv("MPI_HOME") is not None
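As a rough illustration of why these lists can now be empty (a sketch only; the extension name and source path below are placeholders, not the repo's actual setup code): torch.utils.cpp_extension.CUDAExtension links libcudart on its own, so once the explicit libcuda dependency via the CUDA compat directory is dropped, nothing needs to be supplied in libraries or library_dirs:

    # Illustrative sketch; setup_pytorch_extension() assembles its own sources and flags.
    from torch.utils.cpp_extension import CUDAExtension

    library_dirs = []  # previously [cuda_home / "compat" / "lib"]
    libraries = []     # previously ["cuda"]

    ext = CUDAExtension(
        name="transformer_engine_extensions",  # placeholder name
        sources=["csrc/extensions.cpp"],       # placeholder source list
        libraries=libraries,
        library_dirs=library_dirs,
    )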
build_tools/utils.py: 22 additions & 0 deletions

@@ -28,6 +28,28 @@ def debug_build_enabled() -> bool:
     return False


+@functools.lru_cache(maxsize=None)
+def get_max_jobs_for_parallel_build() -> int:
+    """Number of parallel jobs for Ninja build"""
+
+    # Default: maximum parallel jobs
+    num_jobs = 0
+
+    # Check environment variable
+    if os.getenv("NVTE_MAX_BUILD_JOBS"):
+        num_jobs = int(os.getenv("NVTE_MAX_BUILD_JOBS"))
+    elif os.getenv("MAX_JOBS"):
+        num_jobs = int(os.getenv("MAX_JOBS"))
+
+    # Check command-line arguments
+    for arg in sys.argv.copy():
+        if arg.startswith("--parallel="):
+            num_jobs = int(arg.replace("--parallel=", ""))
+            sys.argv.remove(arg)
+
+    return num_jobs
+
+
 def all_files_in_dir(path, name_extension=None):
     all_files = []
     for dirname, _, names in os.walk(path):
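The helper's precedence can be checked in isolation. A quick sketch, assuming the module is importable as build_tools.utils from the repository root (the import path is an assumption); note that lru_cache fixes the result after the first call:

    # Sketch: --parallel= on the command line overrides both environment
    # variables and is consumed from sys.argv.
    import os
    import sys

    from build_tools.utils import get_max_jobs_for_parallel_build  # assumed import path

    os.environ["MAX_JOBS"] = "8"
    os.environ["NVTE_MAX_BUILD_JOBS"] = "2"  # checked before MAX_JOBS
    sys.argv.append("--parallel=4")          # checked last, so it wins

    print(get_max_jobs_for_parallel_build())                   # 4
    print(any(a.startswith("--parallel=") for a in sys.argv))  # False: the argument was consumed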
docs/examples/attention/attention.ipynb: 2 additions & 2 deletions

@@ -345,10 +345,10 @@
 "| cuDNN attention | `bshd`, `sbhd`, `thd` | PyTorch: 3 formats, i.e. 15 layouts<br>JAX, PaddlePaddle: `bs3hd`, `bshd_bs2hd`, `bshd_bshd_bshd` layouts |\n",
 "| Framework-native attention | `bshd`, `sbhd`<br>(`sbhd` requires transpose operations) | PyTorch, JAX, PaddlePaddle: 2 formats, i.e. 10 layouts |\n",
 "\n",
-"Some example usage of the different layouts can be found at [test_dpa_qkv_layout](../../tests/pytorch/fused_attention/test_fused_attn.py) and [test_dpa_qkv_layout_thd](../../tests/pytorch/fused_attention/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention._get_qkv_layout](../../transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n",
+"Some example usage of the different layouts can be found at [test_dpa_qkv_layout](../../tests/pytorch/fused_attention/test_fused_attn.py) and [test_dpa_qkv_layout_thd](../../tests/pytorch/fused_attention/test_fused_attn.py). Transformer Engine also provides a utility function [transformer_engine.pytorch.attention.get_qkv_layout](../../transformer_engine/pytorch/attention.py) to help determine which layout a set of `q`, `k`, `v` tensors have (PyTorch only).\n",
 "\n",
 "<div class=\"alert alert-info\">\n",
-"<b>Note:</b> When RoPE is employed, the <code>qkv_layout</code> may change in Transformer Engine PyTorch through [_get_qkv_layout](../../transformer_engine/pytorch/attention.py). This is due to the in-place nature of our RoPE implementations. We convert `q`, `k`, `v` tensors from their initial layout to the corresponding <code>hd_hd_hd</code> layout. For example, from <code>sbh3d</code> in <code>pytorch.MultiHeadAttention</code> before RoPE, to <code>sbhd_sbhd_sbhd</code> in <code>pytorch.DotProductAttention</code> after RoPE.\n",
+"<b>Note:</b> When RoPE is employed, the <code>qkv_layout</code> may change in Transformer Engine PyTorch through [get_qkv_layout](../../transformer_engine/pytorch/attention.py). This is due to the in-place nature of our RoPE implementations. We convert `q`, `k`, `v` tensors from their initial layout to the corresponding <code>hd_hd_hd</code> layout. For example, from <code>sbh3d</code> in <code>pytorch.MultiHeadAttention</code> before RoPE, to <code>sbhd_sbhd_sbhd</code> in <code>pytorch.DotProductAttention</code> after RoPE.\n",
 "</div>\n"
 ]
},
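A hypothetical call pattern for the renamed utility, based only on the notebook's description that it helps determine which layout a set of q, k, v tensors have; the argument list and return value below are assumptions, so check the actual signature in transformer_engine/pytorch/attention.py:

    # Hypothetical usage sketch; signature and return value are assumptions,
    # not taken from this diff.
    import torch
    from transformer_engine.pytorch.attention import get_qkv_layout

    s, b, h, d = 128, 2, 16, 64  # sequence, batch, heads, head dim
    q = torch.randn(s, b, h, d, dtype=torch.bfloat16, device="cuda")
    k = torch.randn(s, b, h, d, dtype=torch.bfloat16, device="cuda")
    v = torch.randn(s, b, h, d, dtype=torch.bfloat16, device="cuda")

    # Assumed to report a layout such as "sbhd_sbhd_sbhd" for these tensors.
    print(get_qkv_layout(q, k, v, qkv_format="sbhd"))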
tests/jax/test_distributed_fused_attn.py: 13 additions & 7 deletions

@@ -15,8 +15,7 @@
 from transformer_engine.jax import fp8_autocast
 from transformer_engine.jax.attention import (
     is_fused_attn_kernel_available,
-    fused_attn_qkvpacked,
-    fused_attn_kvpacked,
+    fused_attn,
     AttnBiasType,
     AttnMaskType,
     QKVLayout,
@@ -120,11 +119,15 @@ def test_self_attn(

         def target_func(qkv, bias, mask):
             return jnp.mean(
-                fused_attn_qkvpacked(
-                    qkv,
+                fused_attn(
+                    (qkv,),
                     bias,
                     mask,
                     None,
+                    None,
+                    None,
+                    None,
+                    None,
                     attn_bias_type=attn_bias_type,
                     attn_mask_type=attn_mask_type,
                     scaling_factor=scaling_factor,
@@ -252,12 +255,15 @@ def test_cross_attn(

         def target_func(q, kv, mask):
             return jnp.mean(
-                fused_attn_kvpacked(
-                    q,
-                    kv,
+                fused_attn(
+                    (q, kv),
                     None,
                     mask,
                     None,
+                    None,
+                    None,
+                    None,
+                    None,
                     attn_bias_type=attn_bias_type,
                     attn_mask_type=attn_mask_type,
                     scaling_factor=scaling_factor,
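The migration in both tests follows the same pattern: the packed-specific entry points are replaced by the single fused_attn API, which takes a tuple of tensors (one element when Q, K and V are packed together, two for the KV-packed case) followed by several positional arguments that these tests leave as None (presumably optional descriptors such as sequence lengths or offsets; their meaning is not shown in this diff). Condensed, the self-attention call changes as follows:

    # Condensed sketch of the API change above; the meaning of the extra None
    # positional arguments is an assumption, not taken from this diff.

    # Before: packed-QKV entry point
    out = fused_attn_qkvpacked(
        qkv, bias, mask, None,
        attn_bias_type=attn_bias_type,
        attn_mask_type=attn_mask_type,
        scaling_factor=scaling_factor,
    )

    # After: unified entry point; the tuple length encodes the packing
    out = fused_attn(
        (qkv,), bias, mask, None, None, None, None, None,
        attn_bias_type=attn_bias_type,
        attn_mask_type=attn_mask_type,
        scaling_factor=scaling_factor,
    )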