[FuseConsumer] Update consumer fusion pass to be more generic

Abhishek-Varma · Abhishek-Varma · commit ed82f00604f4 · 2024-11-28T15:11:33.000Z
-- This commit updates consumer fusion pass to be more generic whilst trying to pick the compute op/loop to fuse with. -- Helps in [increasing L2 tile size](https://github.com/nod-ai/iree-amd-aie/tree/increase_L2_tile_sizes). Signed-off-by: Abhishek Varma <abhvarma@amd.com>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFuseConsumerIntoLoop.cpp
@@ -36,77 +36,76 @@ void AMDAIEFuseConsumerIntoLoopPass::runOnOperation() {
   mlir::FunctionOpInterface funcOp = getOperation();
   IRRewriter rewriter(context);
 
-  // The depth until which we would keep fusing consumer chain.
+  // Step 1. Find the first scf loop in postorder walk.
+  Operation *scfLoopOp = nullptr;
+  funcOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](LoopLikeOpInterface op) {
+        if (isa<scf::ForOp>(op)) {
+          scfLoopOp = op;
+          return WalkResult::interrupt();
+        } else if (isa<scf::ForallOp>(op)) {
+          scfLoopOp = op;
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      });
+  if (!scfLoopOp) {
+    LLVM_DEBUG(llvm::dbgs()
+               << "There is no scf.for/forall loop to fuse with\n");
+    return;
+  }
+  // Step 2. Search the compute op and its consumer slices.
+  linalg::LinalgOp linalgOp;
+  scfLoopOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](linalg::LinalgOp op) {
+        linalgOp = op;
+        return WalkResult::interrupt();
+      });
+  if (!linalgOp) {
+    LLVM_DEBUG(llvm::dbgs() << "Could not find any compute op\n");
+    return;
+  }
+  // Step 3. The depth until which we would keep fusing consumer chain.
   // TODO(avarma): This should also be part of KernelDispatch logic.
   unsigned fuseDepth = 1;
   // Check if there is matmul-elementwise fusion opportunity. If so, overwrite
-  // the `fuseDepth` to be 2.
-  funcOp->walk<WalkOrder::PostOrder, ReverseIterator>([&](linalg::LinalgOp op) {
-    if (isMatmulProducerOfElementwise(op)) {
-      fuseDepth = 2;
-      return WalkResult::interrupt();
-    }
-    return WalkResult::advance();
-  });
+  // the `fuseDepth` to be 2
+  if (isMatmulProducerOfElementwise(linalgOp)) {
+    fuseDepth = 2;
+  }
 
-  // Based on the `fuseDepth`, we would greedily fuse the consumer ops.
+  Operation *computeOp = linalgOp;
+  // Step 4. Based on the `fuseDepth`, we would greedily fuse the consumer ops.
   for (unsigned depth = 1; depth <= fuseDepth; depth++) {
-    // Walk through the graph in post order and find the loop.
-    Operation *scfLoopOp = nullptr;
-    funcOp->walk<WalkOrder::PostOrder, ReverseIterator>(
-        [&](LoopLikeOpInterface op) {
-          if (isa<scf::ForOp>(op) && useSCFFor) {
-            scfLoopOp = op;
-            return WalkResult::interrupt();
-          } else if (isa<scf::ForallOp>(op) && !useSCFFor) {
-            scfLoopOp = op;
-            return WalkResult::interrupt();
-          }
-          return WalkResult::advance();
-        });
-
-    if (!scfLoopOp) {
-      LLVM_DEBUG(llvm::dbgs()
-                 << "There is no scf.for/forall loop to fuse with\n");
-      return;
-    }
-
-    // Search the compute op and its consumer slices.
-    linalg::LinalgOp linalgOp;
-    scfLoopOp->walk<WalkOrder::PostOrder, ReverseIterator>(
-        [&](linalg::LinalgOp op) {
-          linalgOp = op;
-          return WalkResult::interrupt();
-        });
-
-    if (!linalgOp) {
-      LLVM_DEBUG(llvm::dbgs() << "Could not find any compute op\n");
-      return;
-    }
-
-    Value::user_range users = linalgOp->getResult(0).getUsers();
-    if (!llvm::hasSingleElement(users)) {
-      linalgOp->emitOpError("Expected only one user of the compute op");
-      return signalPassFailure();
-    }
-
-    Operation *terminatorStoreOp = *(users.begin());
-    if (!(isa<tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>(
-            terminatorStoreOp))) {
-      terminatorStoreOp->emitOpError(
-          "Expected either tensor.insert_slice OR tensor.parallel_insert_slice "
-          "to be the only user of the compute op");
-      return signalPassFailure();
-    }
-
-    std::optional<scf::SCFFuseConsumerOfSliceResult> fusedConsumer =
-        scf::tileAndFuseConsumerOfSlice(rewriter, terminatorStoreOp);
-    if (!fusedConsumer) {
-      terminatorStoreOp->emitOpError(
-          "Failed to fuse any consumer op into the producer");
-      return signalPassFailure();
-    }
-    fusedConsumer->origConsumerOperand->getOwner()->erase();
+    LLVM_DEBUG(llvm::dbgs() << "Compute op = " << (*computeOp) << "\n");
+    do {
+      Value::user_range users = computeOp->getResult(0).getUsers();
+      if (!llvm::hasSingleElement(users)) {
+        computeOp->emitOpError("Expected only one user of the compute op");
+        return signalPassFailure();
+      }
+
+      Operation *terminatorStoreOp = *(users.begin());
+      if (!(isa<tensor::InsertSliceOp, tensor::ParallelInsertSliceOp>(
+              terminatorStoreOp))) {
+        computeOp = computeOp->getParentOfType<LoopLikeOpInterface>();
+        LLVM_DEBUG(llvm::dbgs()
+                   << "Going to reattempt fusion because didn't find "
+                      "tensor.insert_slice/tensor.parallel_insert_slice as the "
+                      "user of the compute op\n");
+        continue;
+      }
+      std::optional<scf::SCFFuseConsumerOfSliceResult> fusedConsumer =
+          scf::tileAndFuseConsumerOfSlice(rewriter, terminatorStoreOp);
+      if (!fusedConsumer) {
+        terminatorStoreOp->emitOpError(
+            "Failed to fuse any consumer op into the producer");
+        return signalPassFailure();
+      }
+      fusedConsumer->origConsumerOperand->getOwner()->erase();
+      computeOp = fusedConsumer->tiledAndFusedConsumerOperand->getOwner();
+      break;
+    } while (computeOp && computeOp->getParentOp() != funcOp);
   }
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_for.mlir
@@ -195,20 +195,6 @@ func.func @no_user_of_producer(%arg0: tensor<64xf32>) {
 
 // -----
 
-func.func @insert_slice_not_found(%arg0: tensor<64xf32>) -> tensor<64xf32> {
-  %c0 = arith.constant 0 : index
-  %c1 = arith.constant 1 : index
-  %c8 = arith.constant 8 : index
-  %0 = scf.for %arg1 = %c0 to %c8 step %c1 iter_args(%out = %arg0) -> tensor<64xf32> {
-    %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%arg0, %arg0 : tensor<64xf32>, tensor<64xf32>) outs(%arg0 : tensor<64xf32>) -> tensor<64xf32>
-    // expected-error @+1 {{Expected either tensor.insert_slice OR tensor.parallel_insert_slice to be the only user of the compute op}}
-    scf.yield %1 : tensor<64xf32>
-  }
-  return %0 : tensor<64xf32>
-}
-
-// -----
-
 func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fuse_consumer_into_loop_scf_forall.mlir
@@ -188,17 +188,6 @@ func.func @no_user_of_producer(%arg0: tensor<64xf32>) {
 
 // -----
 
-func.func @parallel_insert_slice_not_found(%arg0: tensor<64xf32>) -> tensor<64xf32> {
-  %0 = scf.forall (%arg1, %arg2) in (1,2) shared_outs(%out = %arg0) -> tensor<64xf32> {
-    %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%arg0, %arg0 : tensor<64xf32>, tensor<64xf32>) outs(%arg0 : tensor<64xf32>) -> tensor<64xf32>
-    // expected-error @+1 {{Expected either tensor.insert_slice OR tensor.parallel_insert_slice to be the only user of the compute op}}
-    %2 = arith.mulf %1, %out : tensor<64xf32>
-  }
-  return %0 : tensor<64xf32>
-}
-
-// -----
-
 func.func @no_consumer_fusion(%arg0: tensor<64xf32>) -> tensor<64xf32> {
   %0 = scf.forall (%arg1, %arg2) in (1,2) shared_outs(%out = %arg0) -> tensor<64xf32> {
     %1 = linalg.elemwise_binary {fun = #linalg.binary_fn<add>} ins(%arg0, %arg0 : tensor<64xf32>, tensor<64xf32>) outs(%arg0 : tensor<64xf32>) -> tensor<64xf32>