
Commit 91ab711

change all_to_all check to allow for split sizes > 1 (#9100)
1 parent a9d25dc commit 91ab711

File tree

2 files changed (+19, -14 lines)

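What the relaxed check permits, in terms of the public API: torch.distributed.all_to_all_single on the "xla" backend can now be called with uniform split sizes greater than 1, where previously every split had to be exactly 1. Below is a minimal sketch of that call pattern, assuming one process per device under a multiprocess launcher such as pjrt.run_multiprocess; the helper name run_all_to_all_single and the split_size value are illustrative only, not part of this commit.

import torch
import torch.distributed as dist
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.xla_backend  # registers the "xla" process group backend


def run_all_to_all_single(split_size: int = 2):
  dist.init_process_group("xla", init_method='xla://')
  device = xm.xla_device()
  world_size = xr.world_size()

  # Each rank sends `split_size` elements to every peer, so the flat input has
  # world_size * split_size elements and every split has the same size.
  tensor_in = torch.arange(
      world_size * split_size, dtype=torch.float, device=device)
  tensor_out = torch.zeros_like(tensor_in)
  splits = [split_size] * world_size
  dist.all_to_all_single(tensor_out, tensor_in, splits, splits)
  xm.mark_step()  # materialize the lazy XLA computation
  return tensor_out.cpu()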

test/pjrt/test_collective_ops_tpu.py

+9 -7
@@ -1,11 +1,9 @@
 import numpy as np
-from typing import List
 import torch
 import torch.nn as nn
 import torch.distributed as dist
 import torch.utils._pytree as pytree
 from absl.testing import absltest, parameterized
-from unittest import mock
 import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.runtime as xr
@@ -247,7 +245,7 @@ def callable(output, input):
     return output.cpu()

   @staticmethod
-  def _all_to_all_single(use_dynamo: bool):
+  def _all_to_all_single(use_dynamo: bool, split_size: int = 1):
     met.clear_all()
     dist.init_process_group("xla", init_method='xla://')
     device = xm.xla_device()
@@ -259,7 +257,7 @@ def callable(output, input):
     # check https://github.com/pytorch/pytorch/blob/758d78790164bfb041555daed380de96e06f78a3/torch/distributed/distributed_c10d.py#L3880
     # for input and output tensor example
     tensor_in = torch.tensor(
-        [xr.local_ordinal()] * tpu.num_expected_global_devices(),
+        [xr.local_ordinal()] * (tpu.num_expected_global_devices() * split_size),
         dtype=torch.float,
         device=device)
     tensor_out = torch.zeros_like(tensor_in)
@@ -315,14 +313,18 @@ def test_reduce_scatter(self, use_dynamo):

   @parameterized.named_parameters(('dynamo', True), ('nondynamo', False))
   def test_all_to_all_single(self, use_dynamo):
+    split_size = 2
     results = pjrt.run_multiprocess(
-        self._all_to_all_single, use_dynamo=use_dynamo)
+        self._all_to_all_single, use_dynamo=use_dynamo, split_size=split_size)
     expected = torch.arange(
-        tpu.num_expected_global_devices(), dtype=torch.float)
+        tpu.num_expected_global_devices(), dtype=torch.float).repeat(split_size)
     # Note: AllToAll xla op does not honor the order of the all_to_all, which means
     # the rank may not follow the order.
     for _, val in results.items():
-      self.assertTrue(torch.allclose(val.sort().values, expected.sort().values))
+      self.assertTrue(
+          torch.allclose(val.sort().values,
+                         expected.sort().values),
+          f"Got {val}, expected {expected}")


 if __name__ == '__main__':
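Why the expected value becomes arange(N).repeat(split_size): each rank fills its input with its own ordinal repeated N * split_size times, so after the exchange every rank holds split_size copies of every ordinal 0..N-1, in a device-dependent order (hence the sort before comparing). A quick CPU-only check of that reasoning; N and split_size below are arbitrary illustrative values, not taken from the test.

import torch

N, split_size = 4, 2  # hypothetical device count and split size

# Rank i's input: its own ordinal repeated N * split_size times.
inputs = [torch.full((N * split_size,), i, dtype=torch.float) for i in range(N)]

# all_to_all_single semantics: rank r's output is the concatenation of
# chunk r (of length split_size) taken from every rank's input.
outputs = [
    torch.cat([inp.split(split_size)[r] for inp in inputs]) for r in range(N)
]

expected = torch.arange(N, dtype=torch.float).repeat(split_size)
for out in outputs:
  assert torch.allclose(out.sort().values, expected.sort().values)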

torch_xla/csrc/cross_replica_reduces.cpp

+10 -7
@@ -329,19 +329,22 @@ at::Tensor all_to_all_single(const at::Tensor& input,
   // this basically is the code copy from
   // init_python_bindings.cpp:_xla_all_to_all
   TORCH_LAZY_FN_COUNTER("xla::");
-  if (output_split_sizes.size() != 0 && input_split_sizes.size() != 0) {
-    for (size_t i = 0; i < input_split_sizes.size(); i++) {
-      if (input_split_sizes[i] != 1)
-        throw std::runtime_error(
-            "torch_xla does not support arbitrary split sizes for all_to_all");
-    }
-  }
   bool pin_layout = false;
   const torch::lazy::Value& token =
       GetAllReduceToken(bridge::GetCurrentDevice());
   int64_t split_count = runtime::GetComputationClient()->GetAllDevices().size();
   std::vector<int64_t> all_groups(split_count);
   std::iota(all_groups.begin(), all_groups.end(), 0);
+
+  if (output_split_sizes.size() != 0 && input_split_sizes.size() != 0) {
+    int64_t split_size = input.size(0) / split_count;
+    for (size_t i = 0; i < input_split_sizes.size(); i++) {
+      if (input_split_sizes[i] != split_size ||
+          output_split_sizes[i] != split_size)
+        throw std::runtime_error(
+            "torch_xla does not support arbitrary split sizes for all_to_all");
+    }
+  }
   XLATensorPtr result_ptr;
   torch::lazy::Value new_token;
   std::tie(result_ptr, new_token) =
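The condition the new check enforces, restated: when explicit split sizes are passed, every input and output split must equal input.size(0) / split_count, so splits must still be uniform; only the hard-coded requirement that each split be exactly 1 is gone. A Python restatement of that condition for illustration (this is not the code path torch_xla executes):

def splits_are_supported(input_len, split_count, input_split_sizes,
                         output_split_sizes):
  # Empty split lists mean "split evenly", which is always accepted.
  if not input_split_sizes or not output_split_sizes:
    return True
  # Otherwise every split, input and output, must equal input_len // split_count.
  split_size = input_len // split_count
  return (all(s == split_size for s in input_split_sizes) and
          all(s == split_size for s in output_split_sizes))


# With 4 devices and an input of length 8, uniform splits of 2 now pass:
assert splits_are_supported(8, 4, [2, 2, 2, 2], [2, 2, 2, 2])
# Ragged splits are still rejected:
assert not splits_are_supported(8, 4, [1, 3, 2, 2], [2, 2, 2, 2])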

0 commit comments
