diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
index b6bf6c877..7bdd8d0d8 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -16,14 +16,14 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-using DmaQueue = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
+using DmaQueueKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 
 /// Utility function to determine whether a DMA wait op can be folded into a
 /// queue based on its half DMA copy operation.
 FailureOr<bool> canFoldByQueue(
     const AMDAIE::AMDAIEDeviceModel &deviceModel,
     AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp,
-    DenseMap<DmaQueue, SmallVector<uint32_t>> &dmaQueueToBdIds) {
+    DenseMap<DmaQueueKey, SmallVector<uint32_t>> &dmaQueueToBdIds) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
       npuHalfDmaCpyNdOp.getConnectionOp();
@@ -104,7 +104,7 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                                   AMDAIE::ControlCodeOp controlCodeOp) {
   IRRewriter rewriter(controlCodeOp->getContext());
   std::vector<AMDAIE::NpuDmaWaitOp> waitOpsToErase;
-  DenseMap<DmaQueue, SmallVector<uint32_t>> dmaQueueToBdIds;
+  DenseMap<DmaQueueKey, SmallVector<uint32_t>> dmaQueueToBdIds;
   // Traverse the control code in reverse.
   WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
       [&](AMDAIE::NpuDmaWaitOp waitOp) {
@@ -168,17 +168,15 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter,
   rewriter.setInsertionPointAfter(waitOps.back());
   rewriter.create<AMDAIE::NpuDmaWaitOp>(waitOps.back().getLoc(), asyncTokens);
 
-  for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) {
-    rewriter.eraseOp(waitOp);
-  }
+  for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) rewriter.eraseOp(waitOp);
   return success();
 }
 
 /// Utility function to determine if a DMA wait operation can be folded into a
 /// a batch based on its half DMA copy operation.
-FailureOr<bool> canFoldByBatch(
-    AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp,
-    SmallVector<AMDAIE::ConnectionOp> &connectionOps) {
+FailureOr<bool> canFoldByBatch(Operation *batchParentOp,
+                               AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp,
+                               DenseSet<AMDAIE::ConnectionOp> &connectionOps) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
       npuHalfDmaCpyNdOp.getConnectionOp();
@@ -199,17 +197,19 @@ FailureOr<bool> canFoldByBatch(
   bool canFold = true;
   // Can't fold if the current connection op already occurs in the batch, or
-  // if the current operation is a packet flow, or if the batch is empty.
-  if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow ||
-      connectionOps.empty()) {
+  // if the current operation is a packet flow, or if the batch is empty, or
+  // if the current operation is not in the same scope as the batch.
+  if (connectionOps.contains(connectionOp) || isPacketFlow ||
+      connectionOps.empty() ||
+      (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) {
     connectionOps.clear();
     canFold = false;
   }
-  connectionOps.push_back(connectionOp);
+  connectionOps.insert(connectionOp);
   return canFold;
 }
 
-/// Traverses the control code forward, ensuring that only one DMA wait op is
+/// Traverses the control code in reverse, ensuring that only one DMA wait op is
 /// retained for every batch of DMA copy operations.
 ///
 /// Example Input:
@@ -227,34 +227,42 @@ FailureOr<bool> canFoldByBatch(
 ///   %2 = dma_cpy_nd(connection2)
 ///   %3 = dma_cpy_nd(connection3)
 ///   dma_wait(%0, %1, %2, %3)
+/// Reverse traversal simplifies handling duplicate connections, preventing
+/// the need to revisit and modify earlier operations after processing later
+/// ones.
 LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   IRRewriter rewriter(controlCodeOp->getContext());
   SmallVector<AMDAIE::NpuDmaWaitOp> waitOps;
-  SmallVector<AMDAIE::ConnectionOp> connectionOps;
-  WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) {
-    bool toBatch = true;
-    for (Value token : waitOp.getAsyncTokens()) {
-      if (auto npuHalfDmaCpyNdOp =
-              dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
-                  token.getDefiningOp())) {
-        FailureOr<bool> result =
-            canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps);
-        if (failed(result)) return WalkResult::interrupt();
-        toBatch &= *result;
-      }
-    }
-    // Process the previous batch of wait ops, and start a new batch.
-    if (!toBatch) {
-      if (failed(updateBatchTokens(rewriter, waitOps)))
-        return WalkResult::interrupt();
-      waitOps.clear();
-    }
-    waitOps.push_back(waitOp);
-    return WalkResult::advance();
-  });
+  DenseSet<AMDAIE::ConnectionOp> connectionOps;
+  WalkResult res = controlCodeOp->walk<WalkOrder::PostOrder, ReverseIterator>(
+      [&](AMDAIE::NpuDmaWaitOp waitOp) {
+        bool toBatch = true;
+        Operation *batchParentOp =
+            waitOps.empty() ? waitOp->getParentOp() : waitOps[0]->getParentOp();
+        for (Value token : waitOp.getAsyncTokens()) {
+          if (auto npuHalfDmaCpyNdOp =
+                  dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+                      token.getDefiningOp())) {
+            FailureOr<bool> result =
+                canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps);
+            if (failed(result)) return WalkResult::interrupt();
+            toBatch &= *result;
+          }
+        }
+        // Process the previous batch of wait ops, and start a new batch.
+        if (!toBatch) {
+          std::reverse(waitOps.begin(), waitOps.end());
+          if (failed(updateBatchTokens(rewriter, waitOps)))
+            return WalkResult::interrupt();
+          waitOps.clear();
+        }
+        waitOps.push_back(waitOp);
+        return WalkResult::advance();
+      });
   if (res.wasInterrupted()) return failure();
   // Process the remaining wait ops.
+  std::reverse(waitOps.begin(), waitOps.end());
   if (failed(updateBatchTokens(rewriter, waitOps))) return failure();
   return success();
 }
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
index b21ceb025..352c8e500 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp
@@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-using DmaChain = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
+using DmaChainKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 
 /// Utility function to update `next_bd` and `start_bd` operands.
 LogicalResult updateChainOperands(
@@ -83,9 +83,9 @@ LogicalResult updateChainOperands(
 /// - Chain X: [0] (the newly added BD ID).
 /// - Chain Y: [] (emptied after breaking).
 void checkForChainsToBeBroken(
-    uint32_t currBdId, const DmaChain &currDmaChain,
-    const DenseMap<DmaChain, DenseSet<uint32_t>> &dmaChainToBdIds,
-    SmallVector<DmaChain> &chainsToBreak) {
+    uint32_t currBdId, const DmaChainKey &currDmaChain,
+    const DenseMap<DmaChainKey, DenseSet<uint32_t>> &dmaChainToBdIds,
+    SmallVector<DmaChainKey> &chainsToBreak) {
   for (auto &[entry, bdIds] : dmaChainToBdIds) {
     if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) {
       // Break the chain that contains the duplicate BD ID.
@@ -120,9 +120,10 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel,
   }
 
   // BD IDs that have been assigned in each tile.
-  DenseMap<DmaChain, DenseSet<uint32_t>> dmaChainToBdIds;
+  DenseMap<DmaChainKey, DenseSet<uint32_t>> dmaChainToBdIds;
   // Buffers the DMA ops that will be chained.
-  DenseMap<DmaChain, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp>> dmaChainToDmaOps;
+  DenseMap<DmaChainKey, SmallVector<AMDAIE::NpuHalfDmaCpyNdOp>>
+      dmaChainToDmaOps;
 
   res = controlCodeOp->walk([&](Operation *op) {
@@ -185,8 +186,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel,
           // Any duplicate BD ID from the same tile indicates that the chain
           // cannot grow further and requires breaking to release the
           // conflicting BD ID.
-          SmallVector<DmaChain> chainsToBreak;
-          DmaChain currDmaChain = {tileOp, connectionOp};
+          SmallVector<DmaChainKey> chainsToBreak;
+          DmaChainKey currDmaChain = {tileOp, connectionOp};
           checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds,
                                    chainsToBreak);
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
index 954f86687..f74b8bad6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir
@@ -70,6 +70,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
+// Expect no DMA waits to be folded, since they are operating on different scopes.
+// CHECK-LABEL: @fold_dma_waits_loop
+// CHECK-COUNT-2: amdaie.npu.dma_wait
+// CHECK-NOT: amdaie.npu.dma_wait
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @fold_dma_waits_loop() {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    amdaie.workgroup {
+      %tile_0_1 = amdaie.tile(%c0, %c1)
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %tile_1_1 = amdaie.tile(%c1, %c1)
+      %tile_1_0 = amdaie.tile(%c1, %c0)
+      %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
+      %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %lock = amdaie.lock(%tile_0_1(4), 4)
+      %lock_3 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_4 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_5 = amdaie.lock(%tile_1_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
+      %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
+      %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false}
+      %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false}
+      %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      amdaie.controlcode {
+        %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %4, 64 : memref<64x32xi32>
+        scf.for %arg0 = %c0 to %c1 step %c8 {
+          %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0)
+          %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+          amdaie.npu.dma_wait(%13 : !amdaie.async_token)
+        }
+        %bd_id = amdaie.bd_id(%tile_1_0, %c0)
+        %12 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id channel = %channel_7) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
+
+// -----
+
 // Same connection, but different BD IDs are used. Expect the DMA waits to be folded.
 // DMA queue has a maximum size of 4. To optimize, starting from the end of the control code,
 // retain every 4th DMA wait operation, while folding the others and removing their tokens.
@@ -229,14 +289,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// The three DMA are operating on two different connections.
-// Expect the last two DMA operations to be batched into a single DMA wait,
-// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch.
-// CHECK-LABEL: @fold_dma_waits_batching
+// The five DMAs are operating on three different connections.
+// Expect the first DMA operation to be retained standalone, while the rest are batched into two DMA waits.
+// This is because each connection can only be accessed once per batch.
+// CHECK-LABEL: @fold_dma_waits_multi_batching
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
 // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
+// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]])
 // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
@@ -245,10 +307,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token)
+// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]])
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_3]]
+// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
+// CHECK: %[[TOKEN_4:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_4]]
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]], %[[TOKEN_4]] : !amdaie.async_token, !amdaie.async_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @fold_dma_waits_batching() {
+  func.func @fold_dma_waits_multi_batching() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c3 = arith.constant 3 : index
@@ -257,42 +324,63 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_0_0 = amdaie.tile(%c0, %c0)
       %tile_1_1 = amdaie.tile(%c1, %c1)
       %tile_1_0 = amdaie.tile(%c1, %c0)
+      %tile_3_1 = amdaie.tile(%c3, %c1)
+      %tile_3_0 = amdaie.tile(%c3, %c0)
       %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
       %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
       %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+      %buffer_3 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
+      %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32>
      %lock = amdaie.lock(%tile_0_1(4), 4)
-      %lock_3 = amdaie.lock(%tile_0_1(5), 0)
-      %lock_4 = amdaie.lock(%tile_1_1(4), 4)
-      %lock_5 = amdaie.lock(%tile_1_1(5), 0)
-      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %lock_5 = amdaie.lock(%tile_0_1(5), 0)
+      %lock_6 = amdaie.lock(%tile_1_1(4), 4)
+      %lock_7 = amdaie.lock(%tile_1_1(5), 0)
+      %lock_8 = amdaie.lock(%tile_3_1(4), 4)
+      %lock_9 = amdaie.lock(%tile_3_1(5), 0)
+      %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
       %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
       %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
       %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
       %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+      %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>
+      %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32>
+      %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo<memref<64x32xi32>>
       %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S)
-      %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
-      %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
-      %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
-      %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false}
-      %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
-      %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false}
-      %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM)
+      %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S)
+      %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM)
+      %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S)
+      %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM)
+      %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false}
+      %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false}
+      %12 = amdaie.connection(%3 {%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
+      %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false}
+      %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie<connection_type Circuit>} : (!amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>, 2>, !amdaie.logicalobjectfifo<memref<64x32xi32>>)
       amdaie.controlcode {
-        %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
-        %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
         memref.assume_alignment %4, 64 : memref<64x32xi32>
+        %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        memref.assume_alignment %7, 64 : memref<64x32xi32>
         %bd_id = amdaie.bd_id(%tile_0_0, %c0)
-        %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        amdaie.npu.dma_wait(%12 : !amdaie.async_token)
-        %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0)
-        %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0)
-        %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
-        amdaie.npu.dma_wait(%13 : !amdaie.async_token)
-        amdaie.npu.dma_wait(%14 : !amdaie.async_token)
+        %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%18 : !amdaie.async_token)
+        %bd_id_15 = amdaie.bd_id(%tile_0_0, %c0)
+        %19 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id_15 channel = %channel) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %bd_id_16 = amdaie.bd_id(%tile_1_0, %c0)
+        %20 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_16 channel = %channel_11) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%19 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%20 : !amdaie.async_token)
+        %bd_id_17 = amdaie.bd_id(%tile_3_0, %c0)
+        %21 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_17 channel = %channel_13) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        %bd_id_18 = amdaie.bd_id(%tile_1_0, %c0)
+        %22 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_18 channel = %channel_11) : !amdaie.logicalobjectfifo<memref<64x32xi32>>
+        amdaie.npu.dma_wait(%21 : !amdaie.async_token)
+        amdaie.npu.dma_wait(%22 : !amdaie.async_token)
         amdaie.end
       }
     }