
Commit 97dc6c3

[call_jax] Bridge the torch_xla and JAX mesh
Now we can run a JAX SPMD function that accesses the ambient SPMD mesh from xb.call_jax. Fixes #8972. Also beefed up the assume_pure tests and updated the docs to mention that mark_sharding is now supported, thanks to qihqi@'s #8989.
1 parent 1b6630e commit 97dc6c3
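
Below is a minimal sketch of what the mesh bridge enables, adapted from the new test/test_jax_interop_spmd.py added in this commit; it assumes a multi-device PJRT runtime (e.g. a TPU VM), and the printed sharding spec is illustrative.

# Sketch (not part of the diff): call a JAX function through xb.call_jax that
# refers to the ambient SPMD mesh. Adapted from test/test_jax_interop_spmd.py;
# assumes a multi-device PJRT runtime such as a TPU VM.
import torch
import torch_xla
import torch_xla.core.xla_builder as xb
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import get_1d_mesh, set_global_mesh

xr.use_spmd()
set_global_mesh(get_1d_mesh(axis_name="model"))  # the ambient SPMD mesh


def f(a, b):
  # "model" resolves against the global torch_xla mesh, which call_jax now
  # bridges into a JAX mesh before lowering.
  import jax
  import jax.numpy as jnp
  from jax.sharding import PartitionSpec as P
  return jax.lax.with_sharding_constraint(a, P("model")) + jnp.sin(b)


a = torch.ones((8, 8), device='xla')
result = xb.call_jax(f, (a, a))
torch_xla.sync(wait=True)
# The result now carries a sharding spec, e.g. "{devices=[N,1]0,1,...}".
print(torch_xla._XLAC._get_xla_sharding_spec(result))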

21 files changed (+206 -80 lines)

benchmarks/benchmark_model.py (+3 -2)

@@ -227,8 +227,9 @@ def is_compatible(self, dummy_benchmark_model: BenchmarkModel,
   def get_benchmark_indices(self, length: int):
     start = self._args.partition_id * (length // self._args.total_partitions)
     end = ((self._args.partition_id + 1) *
-           (length // self._args.total_partitions) if self._args.partition_id
-           < self._args.total_partitions - 1 else length)
+           (length // self._args.total_partitions)
+           if self._args.partition_id < self._args.total_partitions - 1 else
+           length)
     return start, end

   def skip_model(self, model_name: str):

docs/source/perf/assume_pure.md (+7 -2)

@@ -122,8 +122,13 @@ a fixed up-front cost, and then later runs will reuse the cached XLA computation
 ## Limitations

 Currently, all operations in a function wrapped with `@assume_pure` must be
-PyTorch upstream operations (e.g. `torch.einsum`, `torch.sin`, ...). More
-PyTorch/XLA operations (e.g. `mark_sharding`) will be supported in the future.
+PyTorch upstream operations (e.g. `torch.einsum`, `torch.sin`, ...), or these
+PyTorch/XLA operations:
+* `torch_xla.experimental.assume_pure` (recursive `assume_pure`)
+* `torch_xla.distributed.spmd.mark_sharding`
+
+More PyTorch/XLA operations (e.g. `flash_attention`) will be supported in the
+future.

 <!-- xrefs -->
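
For context, a minimal usage sketch of the newly documented `mark_sharding` support inside `@assume_pure`, adapted from test/test_assume_pure_spmd.py added in this commit; it assumes an active SPMD runtime with more than one device.

# Sketch (not part of the diff): mark_sharding is now allowed inside a function
# wrapped with assume_pure. Adapted from test/test_assume_pure_spmd.py; assumes
# SPMD is active and more than one device is available.
import torch
import torch_xla
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import get_1d_mesh, mark_sharding, set_global_mesh
from torch_xla.experimental.assume_pure import assume_pure

xr.use_spmd()
mesh = get_1d_mesh(axis_name="model")
set_global_mesh(mesh)

x = torch.randn((8, 4, 5, 128), device='xla')
# Shard the first dimension across the "model" axis from inside assume_pure.
result = assume_pure(mark_sharding)(x, mesh, ("model", None, None, None))
torch_xla.sync(wait=True)
print(torch_xla._XLAC._get_xla_sharding_spec(result))  # e.g. "{devices=[N,1,1,1]...}"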

test/pytorch_test_base.py (+2 -2)

@@ -619,8 +619,8 @@ def skipped_test(self, *args, reason=reason, **kwargs):
         setattr(cls, dtype_test_name, disallowed_test)
       if not skipped:
         xla_dtypes.append(
-            dtype_combination if len(dtype_combination) >
-            1 else dtype_combination[0])
+            dtype_combination
+            if len(dtype_combination) > 1 else dtype_combination[0])
     if len(xla_dtypes) != 0:
       test.dtypes[cls.device_type] = xla_dtypes
     super().instantiate_test(name, test, generic_cls=generic_cls)

test/run_tests.sh (+2)

@@ -211,7 +211,9 @@ function run_xla_op_tests2 {
   run_test "$CDIR/test_callback.py"
   XLA_USE_SPMD=1 run_test "$CDIR/test_callback.py"
   run_test "$CDIR/test_jax_interop.py"
+  run_test "$CDIR/test_jax_interop_spmd.py"
   run_test "$CDIR/test_assume_pure.py"
+  run_test "$CDIR/test_assume_pure_spmd.py"
 }

 # All the new xla op tests should go to run_xla_op_tests3

test/scan/test_scan_spmd.py (-22)

@@ -8,7 +8,6 @@
 import torch_xla
 import torch.nn as nn
 from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear, Mesh
-from torch_xla.experimental.assume_pure import assume_pure
 from torch_xla.experimental.scan import scan
 from torch_xla.experimental.scan_layers import scan_layers
 from torch_xla.distributed.spmd import mark_sharding, mark_sharding_with_gradients, set_global_mesh, get_1d_mesh, get_global_mesh
@@ -231,27 +230,6 @@ def check_dots_in_model(self, model, x, expect_pattern):
   def count_regex(self, hlo_text, regex_str):
     return len(re.findall(regex_str, hlo_text))

-  @unittest.skipIf(
-      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
-      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
-  )
-  def test_assume_pure_works_with_mark_sharding(self):
-    x = torch.randn((3, 4, 5, 128), device='xla')
-    assume_pure(mark_sharding)(x, self.spmd_mesh, ("model", None, None, None))
-    # assert not throwing
-
-  @unittest.skipIf(
-      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
-      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
-  )
-  def test_convert_to_jax_mesh(self):
-    jax_mesh = self.spmd_mesh.maybe_convert_and_get_jax_mesh()
-    self.assertEqual(jax_mesh.devices.shape, self.spmd_mesh.mesh_shape)
-    np.testing.assert_equal(
-        np.array([dev.id for dev in jax_mesh.devices.flatten()]),
-        self.spmd_mesh.device_ids)
-    # assert not throwing
-

 if __name__ == '__main__':
   test = unittest.main()

test/spmd/test_xla_sharding.py (+12 -15)

@@ -618,9 +618,9 @@ def test_inplace_add_with_sharding(self):

   # avoid calling xr.addressable_device_count here otherwise it will init the test
   # in non-spmd mode.
-  @unittest.skipIf(
-      xr.device_type() == 'CPU',
-      "sharding will be the same for both tensors on single device")
+  @unittest.skipIf(xr.device_type() == 'CPU',
+                   "sharding will be the same for both tensors on single device"
+                  )
   def test_shard_hashing(self):
     xt1 = torch.ones(2, 2).to(xm.xla_device())
     xt2 = torch.ones(2, 2).to(xm.xla_device())
@@ -1383,9 +1383,8 @@ def test_get_1d_mesh(self):
     self.assertEqual(mesh_without_name.mesh_shape,
                      (xr.global_runtime_device_count(),))

-  @unittest.skipUnless(
-      xr.global_runtime_device_count() > 1,
-      "Multiple devices required for dataloader sharding test")
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required for dataloader sharding test")
   def test_data_loader_with_sharding(self):
     device = torch_xla.device()
     mesh = xs.get_1d_mesh("data")
@@ -1406,9 +1405,8 @@ def test_data_loader_with_sharding(self):
         f"{{devices=[{mesh.size()},1,1,1]{','.join([str(i) for i in range(mesh.size())])}}}"
     )

-  @unittest.skipUnless(
-      xr.global_runtime_device_count() > 1,
-      "Multiple devices required for dataloader sharding test")
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required for dataloader sharding test")
   def test_data_loader_with_non_batch_size(self):
     device = torch_xla.device()
     mesh = xs.get_1d_mesh("data")
@@ -1429,9 +1427,8 @@ def test_data_loader_with_non_batch_size(self):
         f"{{devices=[{mesh.size()},1,1,1]{','.join([str(i) for i in range(mesh.size())])}}}"
     )

-  @unittest.skipUnless(
-      xr.global_runtime_device_count() > 1,
-      "Multiple devices required for dataloader sharding test")
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required for dataloader sharding test")
   def test_data_loader_with_non_batch_size_and_mini_batch(self):
     device = torch_xla.device()
     mesh = xs.get_1d_mesh("data")
@@ -1663,9 +1660,9 @@ def test_get_logical_mesh(self):
     self.assertEqual(logical_mesh.shape, mesh_shape)
     np.testing.assert_array_equal(np.sort(logical_mesh.flatten()), device_ids)

-  @unittest.skipIf(
-      xr.device_type() == 'CPU',
-      "sharding will be the same for both tensors on single device")
+  @unittest.skipIf(xr.device_type() == 'CPU',
+                   "sharding will be the same for both tensors on single device"
+                  )
   def test_shard_as(self):
     mesh = self._get_mesh((self.n_devices,))
     partition_spec = (0,)

test/test_assume_pure_spmd.py (+79, new file)

@@ -0,0 +1,79 @@
+import os
+import sys
+import unittest
+
+import numpy as np
+import torch
+import torch_xla
+import torch_xla.runtime as xr
+from torch_xla.experimental.assume_pure import assume_pure
+from torch_xla.distributed.spmd import mark_sharding, set_global_mesh, get_1d_mesh, Mesh
+
+
+class AssumePureSpmdTest(unittest.TestCase):
+
+  def setUp(self):
+    # Activate SPMD
+    xr.use_spmd()
+
+    # Set up a simple SPMD mesh for these tests.
+    self.spmd_mesh = get_1d_mesh(axis_name="model")
+    set_global_mesh(self.spmd_mesh)
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipIf(
+      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
+      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
+  )
+  def test_assume_pure_works_with_mark_sharding(self):
+    x = torch.randn((8, 4, 5, 128), device='xla')
+    result = assume_pure(mark_sharding)(x, self.spmd_mesh,
+                                        ("model", None, None, None))
+    torch_xla.sync(wait=True)
+    N = xr.global_runtime_device_count()
+    self.assertIn(f'devices=[{N}',
+                  torch_xla._XLAC._get_xla_sharding_spec(result))
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipIf(
+      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
+      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
+  )
+  def test_convert_to_jax_mesh(self):
+    jax_mesh = self.spmd_mesh.get_jax_mesh()
+    self.assertEqual(jax_mesh.devices.shape, self.spmd_mesh.mesh_shape)
+    np.testing.assert_equal(
+        np.array([dev.id for dev in jax_mesh.devices.flatten()]),
+        self.spmd_mesh.device_ids)
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipUnless(os.environ.get('PJRT_DEVICE') == 'TPU', "TPU only test")
+  def test_convert_to_jax_mesh_shuffled(self):
+    """Test get_jax_mesh when the PyTorch/XLA mesh has a custom order."""
+
+    # Arrange
+    num_devices = xr.global_runtime_device_count()
+    device_ids = np.arange(num_devices)
+    device_ids = np.random.permutation(device_ids)
+    self.spmd_mesh = Mesh(
+        device_ids, mesh_shape=(num_devices,), axis_names=('model',))
+
+    # Act
+    jax_mesh = self.spmd_mesh.get_jax_mesh()
+
+    # Assert
+    torch_xla_devices = np.array(
+        [xr.global_runtime_device_attributes()[i] for i in device_ids])
+    self.assertEqual(jax_mesh.devices.shape, self.spmd_mesh.mesh_shape)
+    np.testing.assert_equal(
+        np.array([dev.coords for dev in jax_mesh.devices.flatten()]),
+        np.array([dev['coords'] for dev in torch_xla_devices.flatten()]),
+    )
+
+
+if __name__ == '__main__':
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)

test/test_jax_interop_spmd.py (+49, new file)

@@ -0,0 +1,49 @@
+import sys
+import unittest
+
+import torch
+import torch_xla
+import torch_xla.core.xla_model as xm
+import torch_xla.core.xla_builder as xb
+import torch_xla.runtime as xr
+from torch_xla.distributed.spmd import set_global_mesh, get_1d_mesh
+
+
+class TestJaxInteropSpmd(unittest.TestCase):
+
+  def setUp(self):
+    xb._JAX_TO_XLA_COMPUTATION_CACHE.clear()
+    # Activate SPMD
+    xr.use_spmd()
+
+    # Set up a simple SPMD mesh for these tests.
+    self.spmd_mesh = get_1d_mesh(axis_name="model")
+    set_global_mesh(self.spmd_mesh)
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  def test_call_jax_sharding_constraints(self):
+    """Test that we can call jax.lax.with_sharding_constraints from PyTorch/XLA."""
+
+    # Arrange
+    a = torch.ones((8, 8), device='xla')
+
+    def f(a, b):
+      import jax
+      from jax.sharding import PartitionSpec as P
+      import jax.numpy as jnp
+      return jax.lax.with_sharding_constraint(a, P("model",)) + jnp.sin(b)
+
+    # Act
+    result = xb.call_jax(f, (a, a))
+    torch_xla.sync(wait=True)
+
+    # Assert
+    N = xr.global_runtime_device_count()
+    self.assertIn(f'devices=[{N}',
+                  torch_xla._XLAC._get_xla_sharding_spec(result))
+
+
+if __name__ == "__main__":
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)

test/test_operations.py (+5 -3)

@@ -2959,9 +2959,11 @@ def test_dlpack_roundtrip_tensor(self, dtype):

   @onlyIfTorchSupportsCUDA
   @onlyIfPJRTDeviceIsCUDA
-  @parameterized.parameters(
-      *all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool,
-                                 torch.uint16, torch.uint32, torch.uint64))
+  @parameterized.parameters(*all_types_and_complex_and(torch.half,
+                                                       torch.bfloat16,
+                                                       torch.bool, torch.uint16,
+                                                       torch.uint32,
+                                                       torch.uint64))
   def test_dlpack_roundtrip_scalar(self, dtype):
     xla_device = xm.xla_device()
     xla_tensor_0 = torch.tensor(42, dtype=dtype).to(xla_device)

test/test_pallas.py (+4 -4)

@@ -41,10 +41,10 @@ class PallasTest(parameterized.TestCase):
   # therefore we use != instead of ==.
   def _make_attention_mask_from_segment_ids(self, q_segment_ids,
                                             kv_segment_ids):
-    return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
-                              1) != kv_segment_ids.view(kv_segment_ids.shape[0],
-                                                        1, 1,
-                                                        kv_segment_ids.shape[1])
+    return q_segment_ids.view(q_segment_ids.shape[0], 1,
+                              q_segment_ids.shape[1], 1) != kv_segment_ids.view(
+                                  kv_segment_ids.shape[0], 1, 1,
+                                  kv_segment_ids.shape[1])

   def _attention(self, q, k, v, *, attn_mask=None, ab=None):
     attn_weight = q @ k.transpose(-2, -1)

test/test_pallas_spmd.py (+4 -4)

@@ -41,10 +41,10 @@ class PallasTest(unittest.TestCase):
   # therefore we use != instead of ==.
   def _make_attention_mask_from_segment_ids(self, q_segment_ids,
                                             kv_segment_ids):
-    return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
-                              1) != kv_segment_ids.view(kv_segment_ids.shape[0],
-                                                        1, 1,
-                                                        kv_segment_ids.shape[1])
+    return q_segment_ids.view(q_segment_ids.shape[0], 1,
+                              q_segment_ids.shape[1], 1) != kv_segment_ids.view(
+                                  kv_segment_ids.shape[0], 1, 1,
+                                  kv_segment_ids.shape[1])

   def _attention(self, q, k, v, *, attn_mask=None, ab=None):
     attn_weight = q @ k.transpose(-2, -1)

test/test_splash_attention.py (+4 -4)

@@ -62,10 +62,10 @@ def setUp(self):

   def _make_attention_mask_from_segment_ids(self, q_segment_ids,
                                             kv_segment_ids):
-    return q_segment_ids.view(q_segment_ids.shape[0], 1, q_segment_ids.shape[1],
-                              1) != kv_segment_ids.view(kv_segment_ids.shape[0],
-                                                        1, 1,
-                                                        kv_segment_ids.shape[1])
+    return q_segment_ids.view(q_segment_ids.shape[0], 1,
+                              q_segment_ids.shape[1], 1) != kv_segment_ids.view(
+                                  kv_segment_ids.shape[0], 1, 1,
+                                  kv_segment_ids.shape[1])

   def maybe_repeat_kv(self, hidden_state):
     if hidden_state.size(1) == self.NUM_Q_HEADS:

test/tpu/run_tests.sh (+2)

@@ -37,6 +37,8 @@ python3 "$TEST_CDIR/scan/test_scan_pallas.py"
 python3 "$TEST_CDIR/scan/test_scan_layers.py"
 python3 "$TEST_CDIR/test_gru.py"
 python3 "$TEST_CDIR/test_assume_pure.py"
+python3 "$TEST_CDIR/test_assume_pure_spmd.py"
+python3 "$TEST_CDIR/test_jax_interop_spmd.py"
 python3 "$TEST_CDIR/test_as_stride_use_slice.py"
 run_xla_hlo_debug python3 "$TEST_CDIR/scan/test_scan_debug.py"
 python3 "$TEST_CDIR/test_pallas.py" -v

torch_xla/core/xla_builder.py (+13 -2)

@@ -920,8 +920,19 @@ def get_xla_computation():
     import torch_xla.debug.profiler as xp
     # If we see this trace span in the profiler, we'll know that there's a cache miss.
     with xp.Trace('jax_to_xla_computation'):
-      lowered = jax.jit(fn, keep_unused=True).lower(*sample_tensor_args)
-      hlo_ir = lowered.compiler_ir('hlo')
+      jitted = jax.jit(fn, keep_unused=True)
+
+      def do_lower():
+        import torch_xla.runtime as xr
+        import torch_xla.distributed.spmd as xs
+        if xr.is_spmd():
+          mesh = xs.get_global_mesh()
+          if mesh is not None:
+            with mesh.get_jax_mesh():
+              return jitted.lower(*sample_tensor_args)
+        return jitted.lower(*sample_tensor_args)
+
+      hlo_ir = do_lower().compiler_ir('hlo')
       assert len(traced_out_spec) == 1, \
           "fn must be traced to obtain the output tree spec"
       spec = traced_out_spec[0]
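
Why do_lower() enters the mesh: in JAX, a bare PartitionSpec used inside the jitted function is resolved against whatever jax.sharding.Mesh is active at lowering time. A self-contained sketch of that behavior in plain JAX follows; the device layout and axis name are illustrative, not taken from the diff.

# Plain-JAX illustration of the pattern do_lower() relies on.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, PartitionSpec as P


def f(a):
  # The bare PartitionSpec names an axis of the ambient mesh.
  return jax.lax.with_sharding_constraint(a, P("model")) * 2


mesh = Mesh(np.array(jax.devices()), axis_names=("model",))
jitted = jax.jit(f)
with mesh:  # analogous to `with mesh.get_jax_mesh():` in do_lower()
  lowered = jitted.lower(jnp.ones((8, 8)))
print(lowered.as_text())  # the lowered module carries the sharding annotation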

torch_xla/distributed/spmd/xla_sharding.py (+2 -2)

@@ -182,7 +182,7 @@ def from_str(cls, mesh_str: str) -> Optional["Mesh"]:
       return None

   @requires_jax
-  def maybe_convert_and_get_jax_mesh(self):
+  def get_jax_mesh(self):
     # Construct a JAX mesh object with the same device ids shape and ordering
     # from torch_xla device mesh.
     import jax
@@ -611,7 +611,7 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   if tx is not None and isinstance(t, tx.tensor.Tensor):
     from jax.sharding import PartitionSpec as P, NamedSharding
     op_sharding = tuple(str(i) if i is not None else i for i in partition_spec)
-    jmesh = mesh.maybe_convert_and_get_jax_mesh()
+    jmesh = mesh.get_jax_mesh()
     t.shard_(NamedSharding(jmesh, P(*op_sharding)))
     return t

torch_xla/distributed/xla_multiprocessing.py (+1 -4)

@@ -174,10 +174,7 @@ def _v6e_create_replica_groups() -> List | None:
   return None


-device_kind_handler_dict: dict[
-    str,
-    Callable[..., List | None],
-] = {
+device_kind_handler_dict: dict[str, Callable[..., List | None],] = {
     _TPU_V5P: _v5p_create_replica_groups,
     _TPU_V6E: _v6e_create_replica_groups
 }
