NVIDIA · wujingyue · Dec 7, 2024 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/csrc/scheduler/reduction.cpp b/csrc/scheduler/reduction.cpp
@@ -1549,7 +1549,8 @@ void scheduleReduction(Fusion* fusion, const ReductionParams* rparams) {
   }
 
   NVF_ERROR(
-      !(rparams->schedule_3D && isSharded(reduction_tv)),
+      !(rparams->schedule_3D &&
+        getShardedLoopAxis(reduction_tv, ParallelType::DIDx) >= 0),
       "Multidevice nvFuser does not support 3D reduction schedules");
 
   auto dim_analysis = scheduler_utils::canonicalDimReduction(

diff --git a/csrc/tensor_view.cpp b/csrc/tensor_view.cpp
@@ -779,12 +779,6 @@ TensorView* TensorView::rFactor(const std::vector<int64_t>& axes) {
       "Error rfactoring ",
       this,
       " its definition is either a nullptr or not a reduction.");
-  // For hopper matmuls, the mma_result logical domain is reordered as [M, N, K]
-  // using commitLeafToLogical. Thus, the original logical domain is moved to
-  // the root domain.
-  NVF_CHECK(
-      definition()->isA<MmaOp>() || !domain()->hasRoot(),
-      "Cannot call rfactor on the same view twice.");
   NVF_CHECK(
       !definition()->isA<GroupedReductionOp>(),
       "For GroupedReductionOp, use TensorView::rFactor(const std::vector<int64_t>& axes, const std::vector<TensorView*>& tvs)");

diff --git a/csrc/transform_rfactor.cpp b/csrc/transform_rfactor.cpp
@@ -121,12 +121,12 @@ class ReplayRFactor : public ReplayTransformations {
     // rfactored domains. If it isn't involved in the rfactor, it's no
     // longer a redunction domain
     std::optional<IterType> outer_iter_type;
-    if (s->outer()->isReduction() && !rfactor_dep_ids_.count(s->outer())) {
+    if (!rfactor_dep_ids_.count(s->outer())) {
       outer_iter_type = IterType::Iteration;
     }
 
     std::optional<IterType> inner_iter_type;
-    if (s->inner()->isReduction() && !rfactor_dep_ids_.count(s->inner())) {
+    if (!rfactor_dep_ids_.count(s->inner())) {
       inner_iter_type = IterType::Iteration;
     }
 

diff --git a/tests/cpp/test_tutorial.cpp b/tests/cpp/test_tutorial.cpp
@@ -345,11 +345,11 @@ TEST_F(Tutorial, ReductionRFactor) {
 
     // The fusion math should now look like:
     //
-    // tv0: root = logical = [i0]
-    // tv2 = reduction(tv0): root = [i0], logical = [r1/1024, i1024]
-    // tv1 = reduction(tv2): root = logical = [r1024]
+    // tv0: root = logical = [i{i0}]
+    // tv2 = reduction(tv0): root = [r{i0}], logical = [r{i0/1024}, i{1024}]
+    // tv1 = reduction(tv2): root = logical = [r{1024}]
     if (verbose_) {
-      fusion_copy.printMath();
+      fusion_copy.print();
     }
     // Notice that the reduction operation is now split into two
     // operations, where the first one takes care of the first domain, and the

diff --git a/tests/python/test_communication.py b/tests/python/test_communication.py
@@ -52,22 +52,36 @@ def test_allreduce(multidevice_test):
 
     class Model(FusionDefinition):
         def definition(self):
-            self.inp = self.define_tensor((d, 4), contiguity=True, dtype=DataType.Float)
+            self.inp = self.define_tensor(
+                (-1, -1), contiguity=True, dtype=DataType.Float
+            )
             self.out = self.ops.sum(self.inp, [0])
             self.add_output(self.out)
 
         def multidevice_schedule(self):
+            self.sched.split(self.inp, 0, d, False)
+            self.sched.split(self.out, 0, d, False)
+            out_local = self.sched.rfactor(self.out, [1])
+
             self.sched._set_device_mesh(self.inp, mesh)
             self.sched._set_device_mesh(self.out, mesh)
+            self.sched._set_device_mesh(out_local, mesh)
 
             self.sched.parallelize(self.inp, 0, nvfuser.ParallelType.mesh_x)
+            self.sched.parallelize(out_local, 0, nvfuser.ParallelType.mesh_x)
 
-    unsharded = torch.randn(d, 4)
+            self.sched.set_allocation_as_loop(self.inp)
+            self.sched.set_allocation_as_loop(out_local)
+            self.sched.set_allocation_as_loop(self.out)
+
+    m = d * 2
+    n = 3
+    unsharded = torch.randn(m, n)
     sharded = multidevice_test.shard_tensor(unsharded, 0, mesh)
 
     fd = Model()
-    (output,) = fd.execute([sharded])
-    torch.testing.assert_close(output.local.cpu(), unsharded.sum(0))
+    outputs = fd.execute([sharded])
+    torch.testing.assert_close(outputs[0].local.cpu(), unsharded.sum(0))
 
 
 @pytest.mark.mpi