diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
index b6bf6c877..7bdd8d0d8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -16,14 +16,14 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-using DmaQueue = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
+using DmaQueueKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 
 /// Utility function to determine whether a DMA wait op can be folded into a
 /// queue based on its half DMA copy operation.
 FailureOr<bool> canFoldByQueue(
     const AMDAIE::AMDAIEDeviceModel &deviceModel,
     AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp,
-    DenseMap<DmaQueue, SmallVector<uint32_t>> &dmaQueueToBdIds) {
+    DenseMap<DmaQueueKey, SmallVector<uint32_t>> &dmaQueueToBdIds) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
       npuHalfDmaCpyNdOp.getConnectionOp();
@@ -104,7 +104,7 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                                   AMDAIE::ControlCodeOp controlCodeOp) {
   IRRewriter rewriter(controlCodeOp->getContext());
   std::vector<AMDAIE::NpuDmaWaitOp> waitOpsToErase;
-  DenseMap<DmaQueue, SmallVector<uint32_t>> dmaQueueToBdIds;
+  DenseMap<DmaQueueKey, SmallVector<uint32_t>> dmaQueueToBdIds;
   // Traverse the control code in reverse.
   WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
       [&](AMDAIE::NpuDmaWaitOp waitOp) {
@@ -168,17 +168,15 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter,
   rewriter.setInsertionPointAfter(waitOps.back());
   rewriter.create<AMDAIE::NpuDmaWaitOp>(waitOps.back().getLoc(), asyncTokens);
 
-  for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) {
-    rewriter.eraseOp(waitOp);
-  }
+  for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) rewriter.eraseOp(waitOp);
   return success();
 }
 
 /// Utility function to determine if a DMA wait operation can be folded into a
 /// a batch based on its half DMA copy operation.
-FailureOr<bool> canFoldByBatch(
-    AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp,
-    SmallVector<AMDAIE::ConnectionOp> &connectionOps) {
+FailureOr<bool> canFoldByBatch(Operation *batchParentOp,
+                               AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp,
+                               DenseSet<AMDAIE::ConnectionOp> &connectionOps) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
       npuHalfDmaCpyNdOp.getConnectionOp();
@@ -199,17 +197,19 @@ FailureOr<bool> canFoldByBatch(
   bool canFold = true;
   // Can't fold if the current connection op already occurs in the batch, or
-  // if the current operation is a packet flow, or if the batch is empty.
-  if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow ||
-      connectionOps.empty()) {
+  // if the current operation is a packet flow, or if the batch is empty, or
+  // if the current operation is not in the same scope as the batch.
+  if (connectionOps.contains(connectionOp) || isPacketFlow ||
+      connectionOps.empty() ||
+      (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) {
     connectionOps.clear();
     canFold = false;
   }
-  connectionOps.push_back(connectionOp);
+  connectionOps.insert(connectionOp);
   return canFold;
 }
 
-/// Traverses the control code forward, ensuring that only one DMA wait op is
+/// Traverses the control code in reverse, ensuring that only one DMA wait op is
 /// retained for every batch of DMA copy operations.
 ///
 /// Example Input:
@@ -227,34 +227,42 @@ FailureOr<bool> canFoldByBatch(
 ///   %2 = dma_cpy_nd(connection2)
 ///   %3 = dma_cpy_nd(connection3)
 ///   dma_wait(%0, %1, %2, %3)
+/// Reverse traversal simplifies handling duplicate connections, preventing
+/// the need to revisit and modify earlier operations after processing later
+/// ones.
 LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   IRRewriter rewriter(controlCodeOp->getContext());
   SmallVector<AMDAIE::NpuDmaWaitOp> waitOps;
-  SmallVector<AMDAIE::ConnectionOp> connectionOps;
-  WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) {
-    bool toBatch = true;
-    for (Value token : waitOp.getAsyncTokens()) {
-      if (auto npuHalfDmaCpyNdOp =
-              dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
-                  token.getDefiningOp())) {
-        FailureOr<bool> result =
-            canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps);
-        if (failed(result)) return WalkResult::interrupt();
-        toBatch &= *result;
-      }
-    }
-    // Process the previous batch of wait ops, and start a new batch.
-    if (!toBatch) {
-      if (failed(updateBatchTokens(rewriter, waitOps)))
-        return WalkResult::interrupt();
-      waitOps.clear();
-    }
-    waitOps.push_back(waitOp);
-    return WalkResult::advance();
-  });
+  DenseSet<AMDAIE::ConnectionOp> connectionOps;
+  WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](AMDAIE::NpuDmaWaitOp waitOp) {
+        bool toBatch = true;
+        Operation *batchParentOp =
+            waitOps.empty() ? waitOp->getParentOp() : waitOps[0]->getParentOp();
+        for (Value token : waitOp.getAsyncTokens()) {
+          if (auto npuHalfDmaCpyNdOp =
+                  dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+                      token.getDefiningOp())) {
+            FailureOr<bool> result =
+                canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps);
+            if (failed(result)) return WalkResult::interrupt();
+            toBatch &= *result;
+          }
+        }
+        // Process the previous batch of wait ops, and start a new batch.
+        if (!toBatch) {
+          std::reverse(waitOps.begin(), waitOps.end());
+          if (failed(updateBatchTokens(rewriter, waitOps)))
+            return WalkResult::interrupt();
+          waitOps.clear();
+        }
+        waitOps.push_back(waitOp);
+        return WalkResult::advance();
+      });
   if (res.wasInterrupted()) return failure();
   // Process the remaining wait ops.
+  std::reverse(waitOps.begin(), waitOps.end());
   if (failed(updateBatchTokens(rewriter, waitOps))) return failure();
   return success();
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
index b21ceb025..352c8e500 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
@@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-using DmaChain = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
+using DmaChainKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 
 /// Utility function to update `next_bd` and `start_bd` operands.
 LogicalResult updateChainOperands(
@@ -83,9 +83,9 @@ LogicalResult updateChainOperands(
 /// - Chain X: [0] (the newly added BD ID).
 /// - Chain Y: [] (emptied after breaking).
 void checkForChainsToBeBroken(
-    uint32_t currBdId, const DmaChain &currDmaChain,
-    const DenseMap<DmaChain, DenseSet<uint32_t>> &dmaChainToBdIds,
-    SmallVector<DmaChain> &chainsToBreak) {
+    uint32_t currBdId, const DmaChainKey &currDmaChain,
+    const DenseMap<DmaChainKey, DenseSet<uint32_t>> &dmaChainToBdIds,
+    SmallVector<DmaChainKey> &chainsToBreak) {
   for (auto &[entry, bdIds] : dmaChainToBdIds) {
     if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) {
       // Break the chain that contains the duplicate BD ID.
@@ -120,9 +120,10 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel,
   }
 
   // BD IDs that have been assigned in each tile.
-  DenseMap<DmaChain, DenseSet<uint32_t>> dmaChainToBdIds;
+  DenseMap<DmaChainKey, DenseSet<uint32_t>> dmaChainToBdIds;
   // Buffers the DMA ops that will be chained.
-  DenseMap<DmaChain, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp>> dmaChainToDmaOps;
+  DenseMap<DmaChainKey, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp>>
+      dmaChainToDmaOps;
 
   res = controlCodeOp->walk([&](Operation *op) {
@@ -185,8 +186,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel,
           // Any duplicate BD ID from the same tile indicates that the chain
           // cannot grow further and requires breaking to release the
           // conflicting BD ID.
-          SmallVector<DmaChain> chainsToBreak;
-          DmaChain currDmaChain = {tileOp, connectionOp};
+          SmallVector<DmaChainKey> chainsToBreak;
+          DmaChainKey currDmaChain = {tileOp, connectionOp};
           checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds,
                                    chainsToBreak);
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
index 954f86687..f74b8bad6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
@@ -70,6 +70,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect no DMA waits to be folded, since they are operating on different scopes.
+// CHECK-LABEL: @fold_dma_waits_loop
+// CHECK-COUNT-2: amdaie.npu.dma_wait
+// CHECK-NOT: amdaie.npu.dma_wait
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @fold_dma_waits_loop() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    amdaie.workgroup {
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_1_1 = amdaie.tile(%c1, %c1)
+      %tile_1_0 = amdaie.tile(%c1, %c0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(4), 4)
+      %lock_3 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_4 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_5 = amdaie.lock(%tile_1_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
+      %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false}
+      %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false}
+      %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      amdaie.controlcode {
+        %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %4, 64 : memref<64x32xi32>
+        scf.for %arg0 = %c0 to %c1 step %c8 {
+          %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0)
+          %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+          amdaie.npu.dma_wait(%13 : !amdaie.async_token)
+        }
+        %bd_id = amdaie.bd_id(%tile_1_0, %c0)
+        %12 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id channel = %channel_7) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
 // Same connection, but different BD IDs are used. Expect the DMA waits to be folded.
 // DMA queue has a maximum size of 4. To optimize, starting from the end of the control code,
 // retain every 4th DMA wait operation, while folding the others and removing their tokens.
@@ -229,14 +289,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// The three DMA are operating on two different connections.
-// Expect the last two DMA operations to be batched into a single DMA wait,
-// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch.
-// CHECK-LABEL: @fold_dma_waits_batching
+// The five DMAs are operating on three different connections.
+// Expect the first DMA operation to be retained standalone, while the rest are batched into two DMA waits.
+// This is because each connection can only be accessed once per batch.
+// CHECK-LABEL: @fold_dma_waits_multi_batching
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
 // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
+// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]])
 // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
@@ -245,10 +307,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token)
+// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]])
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_3]]
+// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
+// CHECK: %[[TOKEN_4:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_4]]
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]], %[[TOKEN_4]] : !amdaie.async_token, !amdaie.async_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @fold_dma_waits_batching() {
+  func.func @fold_dma_waits_multi_batching() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c3 = arith.constant 3 : index
@@ -257,42 +324,63 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_1_1 = amdaie.tile(%c1, %c1)
       %tile_1_0 = amdaie.tile(%c1, %c0)
+      %tile_3_1 = amdaie.tile(%c3, %c1)
+      %tile_3_0 = amdaie.tile(%c3, %c0)
       %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
       %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
       %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_3 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
      %lock = amdaie.lock(%tile_0_1(4), 4)
-      %lock_3 = amdaie.lock(%tile_0_1(5), 0)
-      %lock_4 = amdaie.lock(%tile_1_1(4), 4)
-      %lock_5 = amdaie.lock(%tile_1_1(5), 0)
-      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %lock_5 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_6 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_7 = amdaie.lock(%tile_1_1(5), 0)
+      %lock_8 = amdaie.lock(%tile_3_1(4), 4)
+      %lock_9 = amdaie.lock(%tile_3_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
       %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
       %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
       %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
       %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
       %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
-      %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
-      %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
-      %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
-      %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false}
-      %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
-      %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false}
-      %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
+      %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S)
+      %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM)
+      %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false}
+      %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false}
+      %12 = amdaie.connection(%3 {%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false}
+      %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
       amdaie.controlcode {
-        %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
-        %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
         memref.assume_alignment %4, 64 : memref<64x32xi32>
+        %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %7, 64 : memref<64x32xi32>
         %bd_id = amdaie.bd_id(%tile_0_0, %c0)
-        %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
-        %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0)
-        %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0)
-        %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        amdaie.npu.dma_wait(%13 : !amdaie.async_token)
-        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%18 : !amdaie.async_token)
+        %bd_id_15 = amdaie.bd_id(%tile_0_0, %c0)
+        %19 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id_15 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %bd_id_16 = amdaie.bd_id(%tile_1_0, %c0)
+        %20 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_16 channel = %channel_11) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%19 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%20 : !amdaie.async_token)
+        %bd_id_17 = amdaie.bd_id(%tile_3_0, %c0)
+        %21 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_17 channel = %channel_13) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %bd_id_18 = amdaie.bd_id(%tile_1_0, %c0)
+        %22 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_18 channel = %channel_11) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%21 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%22 : !amdaie.async_token)
         amdaie.end
       }
     }