
Commit 2fd8ab0

Fix errors
1 parent 06d4b28 commit 2fd8ab0

8 files changed: +16 additions, −281 deletions


API_GUIDE.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -197,7 +197,7 @@ single device snippet. Let's go over then one by one.
 - `torch_xla.launch()`
   - Creates the processes that each run an XLA device.
   - This function is a wrapper of multithreading spawn to allow user run the script with torchrun command line also. Each process will only be able to access the device assigned to the current process. For example on a TPU v4-8, there will be 4 processes being spawn up and each process will own a TPU device.
-  - Note that if you print the `torch.device('xla')` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
+  - Note that if you print the `torch_xla.device()` on each process you will see `xla:0` on all devices. This is because each process can only see one device. This does not mean multi-process is not functioning. The only exeption is with PJRT runtime on TPU v2 and TPU v3 since there will be `#devices/2` processes and each process will have 2 threads (check this [doc](https://github.com/pytorch/xla/blob/master/docs/pjrt.md#tpus-v2v3-vs-v4) for more details).
 - `MpDeviceLoader`
   - Loads the training data onto each device.
   - `MpDeviceLoader` can wrap on a torch dataloader. It can preload the data to the device and overlap the dataloading with device execution to improve the performance.
```
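For context, the sketch below illustrates the multi-process pattern the guide text describes, assuming a recent torch_xla release; the toy model, dataset, and the `_mp_fn` name are placeholders for illustration, not part of the guide or this commit.

```python
# Minimal sketch of torch_xla.launch() + MpDeviceLoader (assumption:
# recent torch_xla; model/dataset below are placeholders).
import torch
import torch_xla
import torch_xla.core.xla_model as xm
from torch_xla.distributed.parallel_loader import MpDeviceLoader


def _mp_fn(index):
  device = torch_xla.device()  # reports as xla:0 in every process
  model = torch.nn.Linear(10, 1).to(device)
  optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

  loader = torch.utils.data.DataLoader(
      [(torch.randn(10), torch.randn(1)) for _ in range(64)], batch_size=8)
  # MpDeviceLoader preloads batches onto the device and overlaps data
  # loading with device execution.
  device_loader = MpDeviceLoader(loader, device)

  for data, target in device_loader:
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(data), target)
    loss.backward()
    xm.optimizer_step(optimizer)  # all-reduce gradients across processes


if __name__ == '__main__':
  torch_xla.launch(_mp_fn, args=())
```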

test/pjrt/test_runtime_multi_cpu.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -27,7 +27,7 @@ def test_default_cpu_device(self):
     os.environ.pop(xenv.PJRT_CPU_ASYNC_CLIENT, None)

     expected = {0: torch.device('xla:0')}
-    devices_per_process = pjrt.run_multiprocess(xm.xla_device)
+    devices_per_process = pjrt.run_multiprocess(torch_xla.device)
     self.assertDictEqual(devices_per_process, expected)

   def test_multi_cpu_devices(self):
@@ -38,7 +38,7 @@ def test_multi_cpu_devices(self):
         3: torch.device('xla:3'),
     }

-    devices_per_process = pjrt.run_multiprocess(xm.xla_device)
+    devices_per_process = pjrt.run_multiprocess(torch_xla.device)
     self.assertDictEqual(devices_per_process, expected)

   def test_global_ordinal(self):
@@ -65,7 +65,7 @@ def forward(ctx, x):
       def backward(ctx, grad_output):
         results['forward_ordinal'] = ctx.forward_ordinal
         results['backward_ordinal'] = xr.global_ordinal()
-        results['device'] = str(torch.device('xla'))
+        results['device'] = str(torch_xla.device())
         return grad_output

     x = torch.ones(1, requires_grad=True, device='xla')
```
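The substitution in these tests relies on the two spellings being interchangeable; a hedged illustration, not taken from the diff itself:

```python
# Hedged illustration: both names are zero-argument callables that return
# the current process's XLA device, which is why either can be handed to
# pjrt.run_multiprocess above.
import torch_xla
import torch_xla.core.xla_model as xm

new_style = torch_xla.device()  # e.g. device(type='xla', index=0)
old_style = xm.xla_device()     # older spelling of the same lookup
assert new_style == old_style
```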

test/pjrt/test_runtime_multi_gpu.py

Lines changed: 0 additions & 266 deletions
This file was deleted.

test/pytorch_test_base.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -559,7 +559,7 @@ def _alt_lookup(d, keys, defval):
 def instantiate_test(cls, name, test, *, generic_cls):
   test_name = name + '_' + cls.device_type
   class_name = cls.__name__
-  real_device_type = xm.xla_device_hw(str(torch.device('xla')))
+  real_device_type = xm.xla_device_hw(str(torch.device('xla:0')))
   assert real_device_type in DISABLED_TORCH_TESTS, 'Unsupported device type:' + real_device_type
   disabled_torch_tests = DISABLED_TORCH_TESTS[real_device_type]

@@ -631,8 +631,8 @@ def get_primary_device(cls):

   @classmethod
   def setUpClass(cls):
-    # Sets the primary test device to the xla_device (CPU or TPU)
-    cls.primary_device = str(torch.device('xla'))
+    # Sets the primary test device to the torch_xla.device (CPU or TPU)
+    cls.primary_device = str(torch_xla.device())
     torch_xla._XLAC._xla_set_mat_mul_precision('highest')

   def setUp(self):
```
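A short note on why this hunk likely fixes an error: a bare `torch.device('xla')` carries no index, so its string form is not the indexed `'xla:0'` that the device-type lookup expects. The sketch below only uses stock PyTorch and illustrative prints:

```python
# Likely the error being fixed: torch.device('xla') has no index, so its
# string form is 'xla' rather than the indexed 'xla:0' expected by
# lookups such as xm.xla_device_hw.
import torch

print(str(torch.device('xla')))    # prints: xla
print(str(torch.device('xla:0')))  # prints: xla:0
```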

test/spmd/test_xla_spmd_python_api_interaction.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -38,7 +38,7 @@ def test_is_master_ordinal(self):
     self.assertTrue(xm.is_master_ordinal())

   def test_xla_device(self):
-    device = torch.device('xla')
+    device = torch_xla.device()
     self.assertEqual(device, torch.device('xla:0'))

   def test_xla_real_devices(self):
```

test/test_operations.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -442,7 +442,7 @@ class TestOptimizationBarrier(test_utils.XlaTestCase):
   def test_optimization_barrier_correctness(self):
     device = torch.device('xla')
     # only test optimization_barrier on TPU
-    if xm.xla_device_hw(device) != 'TPU':
+    if xr.device_type() != 'TPU':
       return
     x = torch.randn(5, 5, device=device)
     y = torch.randn(5, 5, device=device)
@@ -1532,7 +1532,7 @@ def test_deepcopy(self):
     self.assertEqual(x[0], x0)

   def test_print(self):
-    xla_device = torch.device('xla')
+    xla_device = torch.device('xla:0')
     x = torch.tensor([5], device=xla_device)
     expected_str = 'tensor([5], device=\'' + str(xla_device) + '\')'
     self.assertEqual(str(x), expected_str)
@@ -2759,7 +2759,7 @@ def test_send_to_device_grad(self):
     self.assertTrue(dt[0].requires_grad)

   def test_send_to_device_single(self):
-    xla_device = torch.device('xla')
+    xla_device = torch.device('xla:0')
     t = _gen_tensor(2, 2)
     dt = xm.send_cpu_data_to_device(t, xla_device)
     self.assertEqual(dt[0].device, xla_device)
@@ -2859,7 +2859,7 @@ def from_tensors(self, tensors):

     wpack = PackWrapper(pack)

-    xla_device = torch.device('xla')
+    xla_device = torch.device('xla:0')
     xdata = xm.send_cpu_data_to_device(wpack, xla_device)
     self.assertTrue(isinstance(xdata, nn.utils.rnn.PackedSequence))
     self.assertEqual(xdata.batch_sizes.device, torch.device('cpu'))
```
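The first hunk swaps a device-handle lookup for a runtime query; a small sketch under the assumption, consistent with the diff, that `xr.device_type()` reports the backend hardware type (e.g. `'CPU'` or `'TPU'`) without needing a device argument:

```python
# Sketch: xr.device_type() queries the runtime's hardware type directly,
# so no device handle is passed the way xm.xla_device_hw(...) required.
import torch_xla.runtime as xr

if xr.device_type() != 'TPU':
  print('skipping TPU-only check on', xr.device_type())
else:
  print('running TPU-only check')
```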

torch_xla/_internal/pjrt.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -104,7 +104,7 @@ def initialize_singleprocess():
     plugins.default().configure_single_process()
   elif runtime.device_type() == 'TPU':
     tpu.configure_one_chip_topology()
-  xm.set_replication(torch.device('xla'), [])
+  xm.set_replication(torch_xla.device(), [])


 def initialize_multiprocess(local_rank: int, local_world_size: int):
@@ -119,7 +119,7 @@ def initialize_multiprocess(local_rank: int, local_world_size: int):
     neuron.initialize_env(local_rank, local_world_size)

   devices = xm.get_xla_supported_devices()
-  xm.set_replication(torch.device('xla'), devices)
+  xm.set_replication(torch_xla.device(), devices)


 def run_multiprocess(fn: Callable[..., R],
```

torch_xla/runtime.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -156,7 +156,8 @@ def local_ordinal() -> int:
   Local ordinal is in range [0, local_device_count)."""
   local_rank = xu.getenv_as(xenv.PJRT_LOCAL_PROCESS_RANK, int, 0)
   devices_per_process = addressable_device_count()
-  return local_rank * devices_per_process + torch.device('xla').index
+  return local_rank * devices_per_process + torch.device(
+      torch_xla._XLAC._xla_get_default_device()).index


 def process_index() -> int:
```
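A worked example of the formula in this hunk, with illustrative values only (the TPU v3 topology is an assumption taken from the guide text above):

```python
# local_ordinal = local_rank * devices_per_process + default_device_index
# e.g. two addressable devices per process (TPU v3),
# PJRT_LOCAL_PROCESS_RANK=1, default device 'xla:1':
local_rank = 1
devices_per_process = 2
default_device_index = 1  # .index of torch.device('xla:1')
assert local_rank * devices_per_process + default_device_index == 3
```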
