diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index c1bba31f1..fa45c468e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -336,7 +336,7 @@ def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [ let arguments = ( ins Index:$tile, - UI32Attr:$value + Index:$value ); let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }]; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index bb31f7ec9..ad161a626 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -3,11 +3,11 @@ // CHECK-LABEL: func.func @bd_id // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], 0) +// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], %[[C0]]) func.func @bd_id() { %c0 = arith.constant 0 : index %tile = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile, 0) + %bd_id = amdaie.bd_id(%tile, %c0) return } @@ -295,7 +295,7 @@ func.func @npu_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection // CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd async_source // CHECK-SAME: %[[CONNECTION_0]] @@ -308,7 +308,7 @@ func.func @npu_dma_cpy_nd_bd_id(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16 %c16 = arith.constant 16 : index %c128 = arith.constant 128 : index %tile = 
amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile, 0) + %bd_id = amdaie.bd_id(%tile, %c0) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) %1 = amdaie.npu.dma_cpy_nd async_source %0([%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1] bd_id = %bd_id, [%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1] bd_id = %bd_id) return @@ -371,7 +371,7 @@ func.func @npu_dma_cpy_nd_target_source(%arg0: !amdaie.logicalobjectfifo<memref< // CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection // CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd async_source %[[CONNECTION_0]] // CHECK-SAME: %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C128]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]] @@ -383,7 +383,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1 %c16 = arith.constant 16 : index %c128 = arith.constant 128 : index %tile = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile, 0) + %bd_id = amdaie.bd_id(%tile, %c0) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) %1 = amdaie.npu.dma_cpy_nd async_source %0(%arg0[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c128, %c16, 1] bd_id = %bd_id, %arg1[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c16, %c16, 1] bd_id = %bd_id) : target_type = !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>> source_type = !amdaie.logicalobjectfifo<memref<8x16xi32, 1>> return @@ -396,14 +396,14 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1 // 
CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK-DAG: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM) // CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0_0, 0) + %bd_id = amdaie.bd_id(%tile_0_0, %c0) %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>> diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp index 1ba8d185d..29b4bdd2b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp @@ -9,6 +9,8 @@ #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/SCF/Utils/Utils.h" #define DEBUG_TYPE "iree-amdaie-assign-npu-dma-bd-ids" @@ -16,7 +18,161 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -/// Assign BD ids to NPU dma operations using the BD generator +// Utility to retrieve a TileOp from a 
vector of tile values, while doing +// appropriate verifications. +FailureOr<AMDAIE::TileOp> getGeneratorTileOp( + AMDAIE::NpuDmaCpyNdOp &npuDmaOp, + DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) { + SmallVector<Value> tiles; + if (npuDmaOp.getSource()) { + auto logicalObjFifo = + dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>( + npuDmaOp.getSource().getDefiningOp()); + if (!logicalObjFifo) + return npuDmaOp.emitOpError() << "expected a source logical objectFifo"; + tiles = logicalObjFifo.getTiles(); + } + if (npuDmaOp.getTarget()) { + auto logicalObjFifo = + dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>( + npuDmaOp.getTarget().getDefiningOp()); + if (!logicalObjFifo) + return npuDmaOp.emitOpError() + << "expected a target `amdaie.logicalobjectfifo.from_memref`"; + tiles = logicalObjFifo.getTiles(); + } + if (tiles.size() != 1) { + if (tiles.empty()) { + return npuDmaOp.emitOpError() << "no tiles found"; + } else { + return npuDmaOp.emitOpError() + << "operating on multiple tiles is not supported"; + } + } + Value tile = tiles[0]; + if (!shimTileToGeneratorMap.contains(tile)) { + return npuDmaOp.emitOpError() + << "no channel BD ID generator found for tile: " << tile; + } + auto tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp()); + if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found"; + return tileOp; +}; + +// Check if the DMA operation is in the innermost loop of controlcode. +bool isInMostInnerLoop(AMDAIE::NpuDmaCpyNdOp op) { + auto parentLoop = op->getParentOfType<scf::ForOp>(); + if (!parentLoop) { + return false; + } + bool hasNestedLoop = false; + parentLoop.walk([&](scf::ForOp nestedLoop) { + if (nestedLoop != parentLoop) { + hasNestedLoop = true; + } + }); + return !hasNestedLoop; +} + +// Count the number of BD IDs needed per loop iteration, +// so that we know where to start the BD ID for the next iteration. 
+uint32_t countBdIdPerLoopIteration( + scf::ForOp loop, AMDAIE::TileOp tileOp, + DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) { + uint32_t count = 0; + loop.walk([&](AMDAIE::NpuDmaCpyNdOp dmaOp) { + if (dmaOp.getSource() || dmaOp.getTarget()) { + FailureOr<AMDAIE::TileOp> tile = + getGeneratorTileOp(dmaOp, shimTileToGeneratorMap); + if (succeeded(tile) && *tile == tileOp) { + count++; + } + } + }); + return count; +} + +FailureOr<AMDAIE::BdIdOp> getBdIdOp( + IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp, + DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap, + DenseMap<TileOp, uint32_t> &tileToBdIdOffsetMap, + DenseMap<TileOp, uint32_t> &tileToBdIdSizeMap, uint32_t channel) { + FailureOr<AMDAIE::TileOp> tileOp = + getGeneratorTileOp(npuDmaOp, shimTileToGeneratorMap); + if (failed(tileOp)) return failure(); + + ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()]; + std::optional<uint32_t> bdId = + generator.getAndAssignBdId(channel, BdIdAssignmentMode::Incremental); + if (!bdId) return failure(); + AMDAIE::BdIdOp bdIdOp; + rewriter.setInsertionPoint(npuDmaOp); + if (isInMostInnerLoop(npuDmaOp)) { + // If the DMA is in the innermost loop, assign a BD ID using the + // semi-affine expression: + // `(iv * step + bdId - offset) % size + offset`, + // `step` represents the number of BD IDs needed per loop iteration, + // `bdId` is the BD ID assigned by the generator, + // `offset` is the BD ID assigned to the first DMA in the loop, + // `size` is the number of BD IDs available. + if (!tileToBdIdOffsetMap.contains(*tileOp)) + tileToBdIdOffsetMap[*tileOp] = bdId.value(); + // plus one because one BD ID is just assigned in this function. 
+ if (!tileToBdIdSizeMap.contains(*tileOp)) + tileToBdIdSizeMap[*tileOp] = generator.getAvailableBdIdNum(channel) + 1; + auto loop = npuDmaOp->getParentOfType<scf::ForOp>(); + uint32_t bdIdCount = + countBdIdPerLoopIteration(loop, *tileOp, shimTileToGeneratorMap); + Value iv = loop.getInductionVar(); + auto step = rewriter.create<arith::ConstantOp>( + rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdIdCount)); + auto diff = rewriter.create<arith::ConstantOp>( + rewriter.getUnknownLoc(), + rewriter.getIndexAttr(bdId.value() - + tileToBdIdOffsetMap[*tileOp])); // bdId - offset + auto offset = rewriter.create<arith::ConstantOp>( + rewriter.getUnknownLoc(), + rewriter.getIndexAttr(tileToBdIdOffsetMap[*tileOp])); + auto size = rewriter.create<arith::ConstantOp>( + rewriter.getUnknownLoc(), + rewriter.getIndexAttr(tileToBdIdSizeMap[*tileOp])); + auto mul = rewriter.create<arith::MulIOp>(rewriter.getUnknownLoc(), iv, + step); // iv * step + auto add1 = + rewriter.create<arith::AddIOp>(rewriter.getUnknownLoc(), mul, + diff); // iv * step + bdId - offset + auto mod = rewriter.create<arith::RemUIOp>( + rewriter.getUnknownLoc(), add1, + size); // (iv * step + bdId - offset) % size + auto add2 = rewriter.create<arith::AddIOp>( + rewriter.getUnknownLoc(), mod, + offset); // (iv * step + bdId - offset) % size + offset + bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), *tileOp, + add2.getResult()); + } else { + // If the DMA is not in the innermost loop, assign a constant BD ID + auto constant = rewriter.create<arith::ConstantOp>( + rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdId.value())); + bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), *tileOp, + constant.getResult()); + } + return bdIdOp; +}; + +FailureOr<uint32_t> retriveBdId(arith::AddIOp add2) { + uint32_t offset = getConstantIndexOrAssert(add2.getOperand(1)); + if (auto mod = dyn_cast<arith::RemUIOp>(add2.getOperand(0).getDefiningOp())) { + if (auto add1 = + 
dyn_cast<arith::AddIOp>(mod.getOperand(0).getDefiningOp())) { + uint32_t diff = getConstantIndexOrAssert(add1.getOperand(1)); + uint32_t bdId = offset + diff; + return bdId; + } + } + return failure(); +}; + +/// Assign BD ids to NPU dma operations using the BD generator. LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { IRRewriter rewriter(workgroupOp->getContext()); @@ -39,25 +195,15 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { } }); - // Utility to retrieve a TileOp from a vector of tile values, while doing - // appropriate verifications. - auto getGeneratorTileOp = [&](AMDAIE::NpuDmaCpyNdOp &npuDmaOp, - const SmallVector<Value> &tiles, - AMDAIE::TileOp &tileOp) -> LogicalResult { - if (tiles.size() != 1) { - return npuDmaOp.emitOpError() - << "operating on multiple tiles is not supported"; - } - Value tile = tiles[0]; - if (!shimTileToGeneratorMap.contains(tile)) { - return npuDmaOp.emitOpError() - << "no channel BD ID generator found for tile: " << tile; - } - tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp()); - if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found"; - return success(); - }; + // TODO(jornt): Temporarily use channel 0 for all DMAs. This should + // return correct results for Shim channels, however, for generality + // towards other DMAs and future hardware generations, channel + // assignment should happen before BD assignemnt. This requires more + // refactoring. + const uint32_t channel = 0; + DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdOffsetMap; + DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdSizeMap; // Walk `amdaie.npu_dma_cpy_nd` and `amdaie.dma_wait` operations and assign // and release BD IDs when encountering the respective operations using the // tile BD ID generators initialized earlier. 
@@ -65,28 +211,10 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { WalkResult res = controlCodeOp->walk([&](Operation *op) { if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op)) { if (npuDmaOp.getSource()) { - auto logicalObjFifo = - dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>( - npuDmaOp.getSource().getDefiningOp()); - if (!logicalObjFifo) { - npuDmaOp.emitOpError() << "expected a source logical objectFifo"; - return WalkResult::interrupt(); - } - SmallVector<Value> tiles = logicalObjFifo.getTiles(); - AMDAIE::TileOp tileOp; - if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) - return WalkResult::interrupt(); - ChannelBdIdGenerator &generator = - shimTileToGeneratorMap[tileOp.getResult()]; - // TODO(jornt): Temporarily use channel 0 for all DMAs. This should - // return correct results for Shim channels, however, for generality - // towards other DMAs and future hardware generations, channel - // assignment should happen before BD assignemnt. This requires more - // refactoring. 
- std::optional<uint32_t> bdId = generator.getAndAssignBdId(0); - rewriter.setInsertionPointAfter(tileOp); - auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), - tileOp, bdId.value()); + FailureOr<AMDAIE::BdIdOp> bdIdOp = + getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap, + tileToBdIdOffsetMap, tileToBdIdSizeMap, channel); + if (failed(bdIdOp)) return WalkResult::interrupt(); rewriter.setInsertionPoint(npuDmaOp); npuDmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>( npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(), @@ -94,38 +222,19 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), npuDmaOp.getTargetBdId(), npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), - npuDmaOp.getSourceMixedStrides(), bdIdOp); + npuDmaOp.getSourceMixedStrides(), *bdIdOp); } if (npuDmaOp.getTarget()) { - auto logicalObjFifo = - dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>( - npuDmaOp.getTarget().getDefiningOp()); - if (!logicalObjFifo) { - npuDmaOp.emitOpError() - << "expected a target `amdaie.logicalobjectfifo.from_memref`"; - return WalkResult::interrupt(); - } - SmallVector<Value> tiles = logicalObjFifo.getTiles(); - AMDAIE::TileOp tileOp; - if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp))) - return WalkResult::interrupt(); - ChannelBdIdGenerator &generator = - shimTileToGeneratorMap[tileOp.getResult()]; - // TODO(jornt): Temporarily use channel 0 for all DMAs. This should - // return correct results for Shim channels, however, for generality - // towards other DMAs and future hardware generations, channel - // assignment should happen before BD assignemnt. This requires more - // refactoring. 
- std::optional<uint32_t> bdId = generator.getAndAssignBdId(0); - rewriter.setInsertionPointAfter(tileOp); - auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), - tileOp, bdId.value()); + FailureOr<AMDAIE::BdIdOp> bdIdOp = + getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap, + tileToBdIdOffsetMap, tileToBdIdSizeMap, channel); + if (failed(bdIdOp)) return WalkResult::interrupt(); rewriter.setInsertionPoint(npuDmaOp); (void)rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>( npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(), npuDmaOp.getTarget(), npuDmaOp.getTargetMixedOffsets(), npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(), - bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), + *bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(), npuDmaOp.getSourceMixedStrides(), npuDmaOp.getSourceBdId()); } @@ -158,8 +267,18 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { } ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp.getResult()]; - uint32_t value = bdIdOp.getValue(); - generator.releaseBdId(value); + Value value = bdIdOp.getValue(); + if (auto addOp = value.getDefiningOp<arith::AddIOp>()) { + // If the BD ID is a semi-affine expression, retrieve the BD ID for + // the first iteration. + FailureOr<uint32_t> bdId = retriveBdId(addOp); + if (failed(bdId)) return WalkResult::interrupt(); + generator.releaseBdId(*bdId); + } else { + // Else, must be a constant BD ID. 
+ uint32_t bdId = getConstantIndexOrAssert(value); + generator.releaseBdId(bdId); + } } return WalkResult::advance(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp index 2348478d9..8fdd11f56 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -136,7 +136,7 @@ struct HalfDmaCpyNdToNpuConverter final return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; int64_t col = getConstantIndexOrAssert(tileOp.getCol()); int64_t row = getConstantIndexOrAssert(tileOp.getRow()); - int32_t bdId = bdIdOp.getValue(); + int32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); int32_t outOfOrderId{0}; SmallVector<int32_t, 4> staticSizes; @@ -159,22 +159,18 @@ struct HalfDmaCpyNdToNpuConverter final if (stride == 0) { repeatCount = size; } else { - iterationStride = - std::max(stride * elemWidthInBits / minStrideBitWidth, - (int64_t)1); + iterationStride = std::max( + stride * elemWidthInBits / minStrideBitWidth, (int64_t)1); iterationSize = size; - if (stride == 1) - size = (size * elemWidthInBits) / minStrideBitWidth; + if (stride == 1) size = (size * elemWidthInBits) / minStrideBitWidth; repeatCount = iterationSize; } } else { staticStrides.push_back( - std::max(stride * elemWidthInBits / minStrideBitWidth, - (int64_t)1)); + std::max(stride * elemWidthInBits / minStrideBitWidth, (int64_t)1)); // Innermost size needs to account for addressing granularity. 
if (iter.index() == (sizes.size() - 1)) { - staticSizes.push_back(size * elemWidthInBits / - minStrideBitWidth); + staticSizes.push_back(size * elemWidthInBits / minStrideBitWidth); } else { staticSizes.push_back(size); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir index b482663de..b4d0aabe0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir @@ -15,10 +15,10 @@ module { // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_source_token) #map = affine_map<(d0) -> (d0 * 16)> @@ -50,10 +50,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: 
amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] []) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_target_token) #map = affine_map<(d0) -> (d0 * 16)> @@ -87,11 +87,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> @@ -99,8 +96,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) // CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd 
async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[C0]]) // CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) @@ -143,24 +143,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @multiple_dma_cpy_with_bd_id_reuse -// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id_1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) -// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) // CHECK: 
amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) -// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @multiple_dma_cpy_with_bd_id_reuse(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + func.func @multiple_dma_cpy_with_diff_bd_id_1(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index amdaie.workgroup { @@ -186,18 +190,20 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id -// CHECK: %[[C0:.+]] = arith.constant 0 : index +// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id_2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index // CHECK: amdaie.workgroup // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) -// CHECK-DAG: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) -// CHECK-DAG: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 2) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK: %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd // CHECK: 
amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]]) // CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]]) // CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) @@ -205,7 +211,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @multiple_dma_cpy_with_diff_bd_id(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { + func.func @multiple_dma_cpy_with_diff_bd_id_2(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index amdaie.workgroup { @@ -231,20 +237,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK-LABEL: @nested_loops -// CHECK: %[[C0:.+]] = arith.constant 0 : index -// CHECK: %[[C1:.+]] = arith.constant 1 : index -// CHECK: %[[C2:.+]] = arith.constant 2 : index -// CHECK: %[[C6:.+]] = arith.constant 6 : index +// CHECK-LABEL: @nested_loops_1 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 
2 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C15:.+]] = arith.constant 15 : index +// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index // CHECK: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) -// CHECK-DAG: %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1) // CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0) -// CHECK-DAG: %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 1) // CHECK-DAG: %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]]) -// CHECK-DAG: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0) // CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> // CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> @@ -252,14 +255,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd // CHECK: %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd // CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) // CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]]) // CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) +// CHECK: %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) // CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]]) -// 
CHECK: scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[VAR_0:.+]] = arith.remui %[[LOOP_VAR_0:.+]], %[[C15]] : index +// CHECK: %[[VAR_1:.+]] = arith.addi %[[VAR_0]], %[[C1]] : index +// CHECK: %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[VAR_1]]) // CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]]) +// CHECK: %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]]) // CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token) +// CHECK: %[[VAR_2:.+]] = arith.remui %[[LOOP_VAR_0:.+]], %[[C16]] : index +// CHECK: %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[VAR_2]]) // CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]]) // CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token) // CHECK: } @@ -269,7 +280,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} #map = affine_map<(d0) -> (d0 * 16)> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @nested_loops(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) { + func.func @nested_loops_1(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index @@ -310,3 +321,85 @@ module 
attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// CHECK-LABEL: @nested_loops_2 +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[C6:.+]] = arith.constant 6 : index +// CHECK-DAG: %[[C14:.+]] = arith.constant 14 : index +// CHECK: amdaie.workgroup +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> +// CHECK: %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.controlcode +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0]]) +// CHECK: scf.forall (%{{.+}}, %{{.+}}) in (2, 2) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]]) +// CHECK: %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1]]) +// CHECK: scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]] +// CHECK: %[[VAR_0:.+]] = arith.muli %[[LOOP_VAR_0]], %[[C3]] : index +// CHECK: %[[VAR_1:.+]] = arith.remui %[[VAR_0]], %[[C14]] : index +// CHECK: %[[VAR_2:.+]] = arith.addi %[[VAR_1]], %[[C2]] : index +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 
%[[VAR_2]]) +// CHECK: %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_2]]) +// CHECK: %[[VAR_3:.+]] = arith.addi %[[VAR_0]], %[[C1]] : index +// CHECK: %[[VAR_4:.+]] = arith.remui %[[VAR_3]], %[[C14]] : index +// CHECK: %[[VAR_5:.+]] = arith.addi %[[VAR_4]], %[[C2]] : index +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_5]]) +// CHECK: %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_3]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token) +// CHECK: %[[VAR_6:.+]] = arith.addi %[[VAR_0]], %[[C2]] : index +// CHECK: %[[VAR_7:.+]] = arith.remui %[[VAR_6]], %[[C14]] : index +// CHECK: %[[VAR_8:.+]] = arith.addi %[[VAR_7]], %[[C2]] : index +// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_8]]) +// CHECK: %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_4]]) +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token) +// CHECK: } +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token) +// CHECK: } +// CHECK: amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token) +#map = affine_map<(d0) -> (d0 * 16)> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @nested_loops_2(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c6 = arith.constant 6 : index + amdaie.workgroup { + %tile_0_0 = amdaie.tile(%c0, %c0) + 
%placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>> + %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> + %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> + %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>> + %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>> + %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>) + amdaie.controlcode { + %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>> + scf.forall (%arg4, %arg5) in (2, 2) { + %1 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>> + scf.for %arg6 = %c0 to %c6 step %c1 { + %2 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>> + %3 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>> + amdaie.npu.dma_wait(%2 : !amdaie.async_source_token) + amdaie.npu.dma_wait(%3 : !amdaie.async_source_token) + %4 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>> + amdaie.npu.dma_wait(%4 : !amdaie.async_source_token) + } + amdaie.npu.dma_wait(%1 : !amdaie.async_source_token) + } + 
amdaie.npu.dma_wait(%0 : !amdaie.async_source_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir index e15cabc27..db465c0f0 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -40,7 +40,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %lock = amdaie.lock(%tile(4), 4) @@ -55,6 +54,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 
0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} @@ -91,7 +91,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> %lock = amdaie.lock(%tile(4), 4) @@ -106,6 +105,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xbf16>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} @@ -142,7 +142,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %lock = 
amdaie.lock(%tile(4), 4) @@ -157,6 +156,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} @@ -193,7 +193,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> %lock = amdaie.lock(%tile(4), 4) @@ -208,6 +207,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi8>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 
: ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} @@ -243,7 +243,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> %lock = amdaie.lock(%tile(4), 4) @@ -258,6 +257,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = 
array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} @@ -294,7 +294,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile = amdaie.tile(%c0, %c1) %tile_0 = amdaie.tile(%c0, %c0) - %bd_id = amdaie.bd_id(%tile_0, 0) %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> %lock = amdaie.lock(%tile(4), 4) @@ -309,6 +308,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xbf16>> memref.assume_alignment %1, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true} // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir index d52696980..d570d2624 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir @@ -31,7 +31,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) amdaie.controlcode { scf.for %arg2 = %c1 to %c6 step %c2 { - %bd_id = amdaie.bd_id(%tile, 0) + %bd_id = amdaie.bd_id(%tile, %c0) amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1] bd_id = %bd_id, [] [] []) amdaie.npu.dma_cpy_nd %1([%arg2] [16] [1], [] [] []) } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index a036bcb5f..9c4e7a1d4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -874,7 +874,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0 = amdaie.tile(%c0, %c1) %tile_1 = amdaie.tile(%c0, %c2) %tile_2 = amdaie.tile(%c1, %c2) - %bd_id = amdaie.bd_id(%tile, 0) %buffer = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32> %buffer_3 = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32> %lock = amdaie.lock(%tile_0(0), 2) diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp index 339146e5f..de8373916 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp @@ -9,19 +9,39 
@@ namespace mlir::iree_compiler::AMDAIE { std::optional<uint32_t> ChannelBdIdGenerator::getAndAssignBdId( - uint32_t channel) { + uint32_t channel, BdIdAssignmentMode mode) { if (!channelToValidBdIds.contains(channel) || channelToValidBdIds[channel].empty()) { return std::nullopt; } - uint32_t bdId = channelToValidBdIds[channel][0]; - size_t index{1}; - while (isBdIdAssigned(bdId) && index < channelToValidBdIds[channel].size()) { - bdId = channelToValidBdIds[channel][index++]; + + if (mode == BdIdAssignmentMode::Smallest) { + // Smallest: return the first unassigned BD id in the channel's list. + for (uint32_t bdId : channelToValidBdIds[channel]) { + if (!isBdIdAssigned(bdId)) { + assignBdId(bdId); + return bdId; + } + } + } else { + // Incremental: return the first unassigned BD id above lastUsedBdId. + for (uint32_t bdId : channelToValidBdIds[channel]) { + if (bdId > lastUsedBdId && !isBdIdAssigned(bdId)) { + assignBdId(bdId); + return bdId; + } + } + // Otherwise wrap around and scan the ids at or below lastUsedBdId. + for (uint32_t bdId : channelToValidBdIds[channel]) { + if (bdId <= lastUsedBdId && !isBdIdAssigned(bdId)) { + assignBdId(bdId); + return bdId; + } + } } - if (isBdIdAssigned(bdId)) return std::nullopt; - assignBdId(bdId); - return bdId; + + // Every valid BD id of this channel is already assigned. + return std::nullopt; } } // namespace mlir::iree_compiler::AMDAIE diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h index e478ac155..bd18d9c8e 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h @@ -16,6 +16,11 @@ using namespace llvm; namespace mlir::iree_compiler::AMDAIE { +enum class BdIdAssignmentMode { + Incremental, // Prefer the next unused id after the last assigned one + Smallest // Choose the first unused id in the channel's list +}; + /// Utility to generate valid buffer descriptor (BD) ids for channels. Keeps /// state on assigned BD ids to avoid reuse.
class ChannelBdIdGenerator { @@ -28,11 +33,15 @@ class ChannelBdIdGenerator { DenseMap<uint32_t, SmallVector<uint32_t>> &&channelToValidBdIds) : channelToValidBdIds(std::move(channelToValidBdIds)) {} - void assignBdId(uint32_t bdId) { assignedBdIds.insert(bdId); } + void assignBdId(uint32_t bdId) { + assignedBdIds.insert(bdId); + lastUsedBdId = bdId; + } /// Attempts to find and assign an unused BD id for the provided channel. /// Returns `std::nullopt` if no valid BD id could be found. - std::optional<uint32_t> getAndAssignBdId(uint32_t channel); + std::optional<uint32_t> getAndAssignBdId( + uint32_t channel, BdIdAssignmentMode mode = BdIdAssignmentMode::Smallest); /// Check whether the provided BD id is currently assigned. bool isBdIdAssigned(uint32_t bdId) const { return assignedBdIds.count(bdId); } @@ -41,11 +50,25 @@ class ChannelBdIdGenerator { /// reused. void releaseBdId(uint32_t bdId) { assignedBdIds.erase(bdId); } + uint32_t getAvailableBdIdNum(uint32_t channel) { + if (!channelToValidBdIds.contains(channel)) { + return 0; + } else { + uint32_t count = 0; + for (uint32_t bdId : channelToValidBdIds[channel]) { + if (!isBdIdAssigned(bdId)) count += 1; + } + return count; + } + } + private: // Maps channel indices to vectors of valid BD ids. DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds; // Set with all BD ids that are currently assigned. 
DenseSet<uint32_t> assignedBdIds; + // Last BD id passed to assignBdId(); Incremental mode resumes after it + uint32_t lastUsedBdId = std::numeric_limits<uint32_t>::max(); }; } // namespace mlir::iree_compiler::AMDAIE diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp index 3b05cf19d..3ae8530d9 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp +++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp @@ -4,19 +4,15 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - #include <numeric> #include "gtest/gtest.h" #include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h" - namespace { - using namespace mlir::iree_compiler::AMDAIE; - DenseMap<uint32_t, SmallVector<uint32_t>> getTestSingleRangeChannelToValidBdIds() { SmallVector<uint32_t> range(3); @@ -26,7 +22,6 @@ getTestSingleRangeChannelToValidBdIds() { return channelToValidBdIds; } - DenseMap<uint32_t, SmallVector<uint32_t>> getTestEvenOddChannelToValidBdIds() { SmallVector<uint32_t> evenRange(4); std::iota(evenRange.begin(), evenRange.end(), 0); @@ -38,7 +33,6 @@ DenseMap<uint32_t, SmallVector<uint32_t>> getTestEvenOddChannelToValidBdIds() { return channelToValidBdIds; } - TEST(ChannelBdIdGeneratorTest, SingleRange) { ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds()); EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0); @@ -50,7 +44,6 @@ TEST(ChannelBdIdGeneratorTest, SingleRange) { EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt); } - TEST(ChannelBdIdGeneratorTest, EvenOdd) { ChannelBdIdGenerator generator(getTestEvenOddChannelToValidBdIds()); // Check that even channel BDs start from 0 @@ -77,7 +70,6 @@ TEST(ChannelBdIdGeneratorTest, EvenOdd) { EXPECT_EQ(generator.getAndAssignBdId(3), std::nullopt); } - TEST(ChannelBdIdGeneratorTest, AssignBdId) {
ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds()); generator.assignBdId(0); @@ -87,7 +79,6 @@ TEST(ChannelBdIdGeneratorTest, AssignBdId) { EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt); } - TEST(ChannelBdIdGeneratorTest, Release) { ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds()); EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0); @@ -102,10 +93,26 @@ TEST(ChannelBdIdGeneratorTest, Release) { EXPECT_EQ(generator.isBdIdAssigned(1), true); } +TEST(ChannelBdIdGeneratorTest, IncrementalAssign) { + ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds()); + EXPECT_EQ( + generator.getAndAssignBdId(0, BdIdAssignmentMode::Incremental).value(), + 0); + generator.releaseBdId(0); + EXPECT_EQ( + generator.getAndAssignBdId(0, BdIdAssignmentMode::Incremental).value(), + 1); + generator.releaseBdId(1); + EXPECT_EQ( + generator.getAndAssignBdId(0, BdIdAssignmentMode::Incremental).value(), + 2); + generator.releaseBdId(2); + EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0); + generator.releaseBdId(0); +} } // namespace - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS();