Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
Yu-Zhewen committed Dec 1, 2024
1 parent 4f9d65a commit d57f854
Show file tree
Hide file tree
Showing 11 changed files with 401 additions and 144 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [

let arguments = (
ins Index:$tile,
UI32Attr:$value
Index:$value
);

let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
// CHECK-LABEL: func.func @bd_id
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], 0)
// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], %[[C0]])
func.func @bd_id() {
%c0 = arith.constant 0 : index
%tile = amdaie.tile(%c0, %c0)
%bd_id = amdaie.bd_id(%tile, 0)
%bd_id = amdaie.bd_id(%tile, %c0)
return
}

Expand Down Expand Up @@ -295,7 +295,7 @@ func.func @npu_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32,
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
// CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection
// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd async_source
// CHECK-SAME: %[[CONNECTION_0]]
Expand All @@ -308,7 +308,7 @@ func.func @npu_dma_cpy_nd_bd_id(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16
%c16 = arith.constant 16 : index
%c128 = arith.constant 128 : index
%tile = amdaie.tile(%c0, %c0)
%bd_id = amdaie.bd_id(%tile, 0)
%bd_id = amdaie.bd_id(%tile, %c0)
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
%1 = amdaie.npu.dma_cpy_nd async_source %0([%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1] bd_id = %bd_id, [%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1] bd_id = %bd_id)
return
Expand Down Expand Up @@ -371,7 +371,7 @@ func.func @npu_dma_cpy_nd_target_source(%arg0: !amdaie.logicalobjectfifo<memref<
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
// CHECK-DAG: %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
// CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection
// CHECK: %{{.*}} = amdaie.npu.dma_cpy_nd async_source %[[CONNECTION_0]]
// CHECK-SAME: %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C128]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]]
Expand All @@ -383,7 +383,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1
%c16 = arith.constant 16 : index
%c128 = arith.constant 128 : index
%tile = amdaie.tile(%c0, %c0)
%bd_id = amdaie.bd_id(%tile, 0)
%bd_id = amdaie.bd_id(%tile, %c0)
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
%1 = amdaie.npu.dma_cpy_nd async_source %0(%arg0[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c128, %c16, 1] bd_id = %bd_id, %arg1[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c16, %c16, 1] bd_id = %bd_id) : target_type = !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>> source_type = !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
return
Expand All @@ -396,14 +396,14 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
// CHECK-DAG: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM)
// CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection
func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%tile_0_0 = amdaie.tile(%c0, %c0)
%bd_id = amdaie.bd_id(%tile_0_0, 0)
%bd_id = amdaie.bd_id(%tile_0_0, %c0)
%channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>)
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,170 @@
#include "iree-amd-aie/Transforms/Passes.h"
#include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"

#define DEBUG_TYPE "iree-amdaie-assign-npu-dma-bd-ids"

namespace mlir::iree_compiler::AMDAIE {

namespace {

/// Assign BD ids to NPU dma operations using the BD generator
// Resolves the single `amdaie.tile` op that `npuDmaOp` operates on.
//
// The tile list is taken from the op's source logical objectFifo and/or its
// target `amdaie.logicalobjectfifo.from_memref`. An op error is emitted and
// failure is returned when:
//   - a present source/target is not defined by the expected op kind,
//   - zero or more than one tile is found,
//   - the tile has no entry in `shimTileToGeneratorMap`, or
//   - the tile value is not defined by an `amdaie.tile` op.
//
// NOTE(review): when both source and target are set, the target's tiles
// replace the source's, so only the target side is resolved. Confirm this is
// intended for DMAs that carry both.
FailureOr<AMDAIE::TileOp> getGeneratorTileOp(
    AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) {
  SmallVector<Value> candidateTiles;
  if (npuDmaOp.getSource()) {
    auto sourceFifo = dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
        npuDmaOp.getSource().getDefiningOp());
    if (!sourceFifo)
      return npuDmaOp.emitOpError() << "expected a source logical objectFifo";
    candidateTiles = sourceFifo.getTiles();
  }
  if (npuDmaOp.getTarget()) {
    auto targetFifo =
        dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
            npuDmaOp.getTarget().getDefiningOp());
    if (!targetFifo)
      return npuDmaOp.emitOpError()
             << "expected a target `amdaie.logicalobjectfifo.from_memref`";
    candidateTiles = targetFifo.getTiles();
  }

  // Exactly one tile is supported; report the two failure modes separately.
  if (candidateTiles.empty())
    return npuDmaOp.emitOpError() << "no tiles found";
  if (candidateTiles.size() != 1)
    return npuDmaOp.emitOpError()
           << "operating on multiple tiles is not supported";

  Value tile = candidateTiles.front();
  if (!shimTileToGeneratorMap.contains(tile))
    return npuDmaOp.emitOpError()
           << "no channel BD ID generator found for tile: " << tile;

  if (auto tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp()))
    return tileOp;
  return npuDmaOp.emitOpError() << "no tile op found";
}

// Returns true when the closest enclosing `scf.for` of `op` contains no other
// `scf.for` nested inside it. Returns false when `op` has no enclosing
// `scf.for` at all.
bool isInMostInnerLoop(AMDAIE::NpuDmaCpyNdOp op) {
  auto enclosingLoop = op->getParentOfType<scf::ForOp>();
  if (!enclosingLoop) return false;

  // The walk also visits `enclosingLoop` itself, so skip it and stop at the
  // first loop that is genuinely nested.
  WalkResult nestedSearch = enclosingLoop.walk([&](scf::ForOp candidate) {
    return candidate == enclosingLoop ? WalkResult::advance()
                                      : WalkResult::interrupt();
  });
  return !nestedSearch.wasInterrupted();
}

// Counts the `amdaie.npu.dma_cpy_nd` ops inside `loop` that resolve (via
// `getGeneratorTileOp`) to `tileOp`. The result is used as the number of BD
// IDs consumed per loop iteration, i.e. how far the BD ID must advance from
// one iteration to the next.
//
// DMA ops with neither a source nor a target are ignored, as are ops whose
// tile cannot be resolved. Each matching op contributes one to the count.
uint32_t countBdIdPerLoopIteration(
    scf::ForOp loop, AMDAIE::TileOp tileOp,
    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) {
  uint32_t numMatchingDmas = 0;
  loop.walk([&](AMDAIE::NpuDmaCpyNdOp dmaOp) {
    if (!dmaOp.getSource() && !dmaOp.getTarget()) return;
    FailureOr<AMDAIE::TileOp> dmaTile =
        getGeneratorTileOp(dmaOp, shimTileToGeneratorMap);
    if (failed(dmaTile)) return;
    if (*dmaTile == tileOp) ++numMatchingDmas;
  });
  return numMatchingDmas;
}

// Assigns a BD ID for `npuDmaOp` on `channel` and materializes it as an
// `amdaie.bd_id` op inserted right before `npuDmaOp`.
//
// - `shimTileToGeneratorMap`: per-tile BD ID generators; the generator of the
//   tile resolved by `getGeneratorTileOp` is used.
// - `tileToBdIdOffsetMap`: per tile, the first BD ID handed out to a DMA found
//   in an innermost loop. Filled in lazily on first use.
// - `tileToBdIdSizeMap`: per tile, the number of BD IDs that were available
//   when that first BD ID was handed out. Filled in lazily on first use.
//
// Returns the created `amdaie.bd_id` op, or failure (with a diagnostic on
// `npuDmaOp`) if the tile cannot be resolved or no BD ID is available.
FailureOr<AMDAIE::BdIdOp> getBdIdOp(
    IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap,
    DenseMap<TileOp, uint32_t> &tileToBdIdOffsetMap,
    DenseMap<TileOp, uint32_t> &tileToBdIdSizeMap, uint32_t channel) {
  FailureOr<AMDAIE::TileOp> tileOp =
      getGeneratorTileOp(npuDmaOp, shimTileToGeneratorMap);
  if (failed(tileOp)) return failure();

  ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()];
  std::optional<uint32_t> bdId =
      generator.getAndAssignBdId(channel, BdIdAssignmentMode::Incremental);
  // Report exhaustion explicitly so the pass never fails without a message.
  if (!bdId) return npuDmaOp.emitOpError() << "no BD ID available";

  Location loc = rewriter.getUnknownLoc();
  rewriter.setInsertionPoint(npuDmaOp);

  if (!isInMostInnerLoop(npuDmaOp)) {
    // Outside of an innermost loop the BD ID is a plain constant.
    auto constant = rewriter.create<arith::ConstantOp>(
        loc, rewriter.getIndexAttr(bdId.value()));
    AMDAIE::BdIdOp bdIdOp =
        rewriter.create<AMDAIE::BdIdOp>(loc, *tileOp, constant.getResult());
    return bdIdOp;
  }

  // If the DMA is in the innermost loop, assign a BD ID using the
  // semi-affine expression:
  // `(iv * step + bdId - offset) % size + offset`,
  // `step` represents the number of BD IDs needed per loop iteration,
  // `bdId` is the BD ID assigned by the generator,
  // `offset` is the BD ID assigned to the first DMA in the loop,
  // `size` is the number of BD IDs available.
  // NOTE(review): the expression equals `bdId` only for iv == 0; presumably
  // the loops are normalized to start at 0 with unit step — confirm.
  if (!tileToBdIdOffsetMap.contains(*tileOp))
    tileToBdIdOffsetMap[*tileOp] = bdId.value();
  // Plus one because one BD ID was just assigned in this function.
  if (!tileToBdIdSizeMap.contains(*tileOp))
    tileToBdIdSizeMap[*tileOp] = generator.getAvailableBdIdNum(channel) + 1;
  const uint32_t bdIdOffset = tileToBdIdOffsetMap[*tileOp];
  const uint32_t bdIdSize = tileToBdIdSizeMap[*tileOp];

  auto loop = npuDmaOp->getParentOfType<scf::ForOp>();
  const uint32_t bdIdCount =
      countBdIdPerLoopIteration(loop, *tileOp, shimTileToGeneratorMap);
  Value iv = loop.getInductionVar();

  // Constants first, then the arithmetic, to keep the emitted op order stable.
  auto step =
      rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(bdIdCount));
  auto diff = rewriter.create<arith::ConstantOp>(
      loc, rewriter.getIndexAttr(bdId.value() - bdIdOffset));  // bdId - offset
  auto offset = rewriter.create<arith::ConstantOp>(
      loc, rewriter.getIndexAttr(bdIdOffset));
  auto size =
      rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(bdIdSize));
  // iv * step
  auto mul = rewriter.create<arith::MulIOp>(loc, iv, step);
  // iv * step + bdId - offset
  auto add1 = rewriter.create<arith::AddIOp>(loc, mul, diff);
  // (iv * step + bdId - offset) % size
  auto mod = rewriter.create<arith::RemUIOp>(loc, add1, size);
  // (iv * step + bdId - offset) % size + offset
  auto add2 = rewriter.create<arith::AddIOp>(loc, mod, offset);
  AMDAIE::BdIdOp bdIdOp =
      rewriter.create<AMDAIE::BdIdOp>(loc, *tileOp, add2.getResult());
  return bdIdOp;
}

// Recovers the BD ID that the generator originally assigned from the
// expression `(iv * step + diff) % size + offset`, where `add2` is the
// outermost addition. The recovered value is `offset + diff`.
//
// The expected shape is `add2 = addi(remui(addi(_, diff), _), offset)` with
// `diff` and `offset` being constant index values. If `add2` does not have
// this shape (including when an operand is a block argument and therefore has
// no defining op), an op error is emitted and failure is returned.
FailureOr<uint32_t> retriveBdId(arith::AddIOp add2) {
  // `getDefiningOp<OpTy>()` tolerates values without a defining op, unlike a
  // plain `dyn_cast` on the result of `getDefiningOp()`.
  auto mod = add2.getOperand(0).getDefiningOp<arith::RemUIOp>();
  auto add1 =
      mod ? mod.getOperand(0).getDefiningOp<arith::AddIOp>() : arith::AddIOp();
  if (!add1) {
    return add2.emitOpError()
           << "expected a BD ID expression of the form "
              "`(iv * step + diff) % size + offset`";
  }
  // Only read the constants once the overall pattern has been matched.
  uint32_t offset = getConstantIndexOrAssert(add2.getOperand(1));
  uint32_t diff = getConstantIndexOrAssert(add1.getOperand(1));
  uint32_t bdId = offset + diff;
  return bdId;
}

/// Assign BD ids to NPU dma operations using the BD generator.
LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
IRRewriter rewriter(workgroupOp->getContext());

Expand All @@ -39,93 +195,46 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
}
});

// Utility to retrieve a TileOp from a vector of tile values, while doing
// appropriate verifications.
auto getGeneratorTileOp = [&](AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
const SmallVector<Value> &tiles,
AMDAIE::TileOp &tileOp) -> LogicalResult {
if (tiles.size() != 1) {
return npuDmaOp.emitOpError()
<< "operating on multiple tiles is not supported";
}
Value tile = tiles[0];
if (!shimTileToGeneratorMap.contains(tile)) {
return npuDmaOp.emitOpError()
<< "no channel BD ID generator found for tile: " << tile;
}
tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp());
if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found";
return success();
};
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should
// return correct results for Shim channels, however, for generality
// towards other DMAs and future hardware generations, channel
// assignment should happen before BD assignment. This requires more
// refactoring.
const uint32_t channel = 0;

DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdOffsetMap;
DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdSizeMap;
// Walk `amdaie.npu_dma_cpy_nd` and `amdaie.dma_wait` operations and assign
// and release BD IDs when encountering the respective operations using the
// tile BD ID generators initialized earlier.
AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode();
WalkResult res = controlCodeOp->walk([&](Operation *op) {
if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op)) {
if (npuDmaOp.getSource()) {
auto logicalObjFifo =
dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
npuDmaOp.getSource().getDefiningOp());
if (!logicalObjFifo) {
npuDmaOp.emitOpError() << "expected a source logical objectFifo";
return WalkResult::interrupt();
}
SmallVector<Value> tiles = logicalObjFifo.getTiles();
AMDAIE::TileOp tileOp;
if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp)))
return WalkResult::interrupt();
ChannelBdIdGenerator &generator =
shimTileToGeneratorMap[tileOp.getResult()];
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should
// return correct results for Shim channels, however, for generality
// towards other DMAs and future hardware generations, channel
// assignment should happen before BD assignemnt. This requires more
// refactoring.
std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
rewriter.setInsertionPointAfter(tileOp);
auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
tileOp, bdId.value());
FailureOr<AMDAIE::BdIdOp> bdIdOp =
getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap,
tileToBdIdOffsetMap, tileToBdIdSizeMap, channel);
if (failed(bdIdOp)) return WalkResult::interrupt();
rewriter.setInsertionPoint(npuDmaOp);
npuDmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(),
npuDmaOp.getTarget(), npuDmaOp.getTargetMixedOffsets(),
npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(),
npuDmaOp.getTargetBdId(), npuDmaOp.getSource(),
npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(),
npuDmaOp.getSourceMixedStrides(), bdIdOp);
npuDmaOp.getSourceMixedStrides(), *bdIdOp);
}
if (npuDmaOp.getTarget()) {
auto logicalObjFifo =
dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
npuDmaOp.getTarget().getDefiningOp());
if (!logicalObjFifo) {
npuDmaOp.emitOpError()
<< "expected a target `amdaie.logicalobjectfifo.from_memref`";
return WalkResult::interrupt();
}
SmallVector<Value> tiles = logicalObjFifo.getTiles();
AMDAIE::TileOp tileOp;
if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp)))
return WalkResult::interrupt();
ChannelBdIdGenerator &generator =
shimTileToGeneratorMap[tileOp.getResult()];
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should
// return correct results for Shim channels, however, for generality
// towards other DMAs and future hardware generations, channel
// assignment should happen before BD assignemnt. This requires more
// refactoring.
std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
rewriter.setInsertionPointAfter(tileOp);
auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
tileOp, bdId.value());
FailureOr<AMDAIE::BdIdOp> bdIdOp =
getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap,
tileToBdIdOffsetMap, tileToBdIdSizeMap, channel);
if (failed(bdIdOp)) return WalkResult::interrupt();
rewriter.setInsertionPoint(npuDmaOp);
(void)rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(),
npuDmaOp.getTarget(), npuDmaOp.getTargetMixedOffsets(),
npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(),
bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(),
*bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(),
npuDmaOp.getSourceMixedSizes(), npuDmaOp.getSourceMixedStrides(),
npuDmaOp.getSourceBdId());
}
Expand Down Expand Up @@ -158,8 +267,18 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
}
ChannelBdIdGenerator &generator =
shimTileToGeneratorMap[tileOp.getResult()];
uint32_t value = bdIdOp.getValue();
generator.releaseBdId(value);
Value value = bdIdOp.getValue();
if (auto addOp = value.getDefiningOp<arith::AddIOp>()) {
// If the BD ID is a semi-affine expression, retrieve the BD ID for
// the first iteration.
FailureOr<uint32_t> bdId = retriveBdId(addOp);
if (failed(bdId)) return WalkResult::interrupt();
generator.releaseBdId(*bdId);
} else {
// Else, must be a constant BD ID.
uint32_t bdId = getConstantIndexOrAssert(value);
generator.releaseBdId(bdId);
}
}
return WalkResult::advance();
}
Expand Down
Loading

0 comments on commit d57f854

Please sign in to comment.