-
Notifications
You must be signed in to change notification settings - Fork 31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a pass to chain DMA BDs #931
Changes from all commits
5b9b1d4
fc2f42f
33aa97b
3d850db
2c857a6
2940ed5
bb56113
30c65f7
c527a09
49a0a5b
a8c52b5
770fd34
3104c84
358a3d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -591,6 +591,14 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
ShapedType::kDynamic encodes that the corresponding entry has a dynamic | ||
value. | ||
|
||
It also supports the representation of DMA BD chaining using the `use_next_bd`, | ||
`next_bd`, and `start_bd` operands. The `use_next_bd` operand indicates | ||
whether another DMA operation is chained to follow this one. | ||
If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of | ||
the next DMA operation in the chain. Within a chain, the `start_bd` operand | ||
identifies the BD ID of the first DMA operation in the sequence. | ||
When `use_next_bd` is `false`, the `start_bd` is set to the same value as `bd_id`. | ||
|
||
Example: | ||
|
||
```mlir | ||
|
@@ -604,7 +612,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
%5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} | ||
: memref<32x1024xi32> -> !amdaie.logicalobjectfifo<memref<32768xi32>> | ||
%4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1] | ||
bd_id = %bd_id channel = %channel) | ||
bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id) | ||
... | ||
} | ||
``` | ||
|
@@ -620,7 +628,10 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
DenseI64ArrayAttr:$static_sizes, | ||
DenseI64ArrayAttr:$static_strides, | ||
Optional<Index>:$bd_id, | ||
Optional<Index>:$channel | ||
Optional<Index>:$channel, | ||
BoolAttr:$use_next_bd, | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it might be useful to make this optional as well. |
||
Optional<Index>:$next_bd, | ||
Optional<Index>:$start_bd | ||
); | ||
|
||
let results = (outs Optional<AMDAIE_AsyncTokenType>:$async_token); | ||
|
@@ -635,6 +646,9 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
custom<DynamicIndexList>($strides, $static_strides) | ||
(`bd_id` `=` $bd_id^)? | ||
(`channel` `=` $channel^)? | ||
`use_next_bd` `=` $use_next_bd | ||
(`next_bd` `=` $next_bd^)? | ||
(`start_bd` `=` $start_bd^)? | ||
`)` | ||
attr-dict | ||
`:` type($input) | ||
|
@@ -645,16 +659,19 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, | ||
"::mlir::Value":$input, "ArrayRef<OpFoldResult>":$offsets, | ||
"ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides, | ||
"::mlir::Value":$bd_id, "::mlir::Value":$channel)>, | ||
"::mlir::Value":$bd_id, "::mlir::Value":$channel, | ||
"bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, | ||
// Build a NpuHalfDmaCpyNdOp with static entries. | ||
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, | ||
"::mlir::Value":$target, "ArrayRef<int64_t>":$offsets, | ||
"ArrayRef<int64_t>":$sizes, "ArrayRef<int64_t>":$strides, | ||
"::mlir::Value":$bd_id, "::mlir::Value":$channel)>, | ||
"::mlir::Value":$bd_id, "::mlir::Value":$channel, | ||
"bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, | ||
// Build a NpuHalfDmaCpyNdOp with dynamic entries. | ||
OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, | ||
"::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, | ||
"ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)> | ||
"ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel, | ||
"bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>, | ||
]; | ||
|
||
let extraClassDeclaration = [{ | ||
|
@@ -673,6 +690,16 @@ def AMDAIE_NpuHalfDmaCpyNdOp | |
return dyn_cast_if_present<BdIdOp>(getBdId().getDefiningOp()); | ||
} | ||
|
||
// Return the `BdIdOp` defining the `next_bd` operand, or `std::nullopt` when no `next_bd` operand is present. | ||
std::optional<BdIdOp> getNextBdIdOp() { | ||
if (!getNextBd()) return std::nullopt; | ||
return dyn_cast_if_present<BdIdOp>(getNextBd().getDefiningOp()); | ||
} | ||
|
||
// Return the `BdIdOp` defining the `start_bd` operand, or `std::nullopt` when no `start_bd` operand is present. | ||
std::optional<BdIdOp> getStartBdIdOp() { | ||
if (!getStartBd()) return std::nullopt; | ||
return dyn_cast_if_present<BdIdOp>(getStartBd().getDefiningOp()); | ||
} | ||
|
||
// Return the input `amdaie.connection` operation. | ||
std::optional<ConnectionOp> getConnectionOp() { | ||
return dyn_cast_if_present<ConnectionOp>(getConnection().getDefiningOp()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -406,16 +406,16 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32> | |
%bd_id = amdaie.bd_id(%tile_0_0, 0) | ||
%channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) | ||
%0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>> | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add some new roundtrip checks with |
||
return | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,13 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { | |
return success(); | ||
}; | ||
|
||
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can move this pass after
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will have it in a separate PR |
||
// return correct results for Shim channels, however, for generality | ||
// towards other DMAs and future hardware generations, channel | ||
// assignment should happen before BD assignment. This requires more | ||
// refactoring. | ||
const uint32_t channel = 0; | ||
|
||
// Walk `amdaie.npu_dma_cpy_nd` and `amdaie.dma_wait` operations and assign | ||
// and release BD IDs when encountering the respective operations using the | ||
// tile BD ID generators initialized earlier. | ||
|
@@ -78,12 +85,8 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { | |
return WalkResult::interrupt(); | ||
ChannelBdIdGenerator &generator = | ||
shimTileToGeneratorMap[tileOp.getResult()]; | ||
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should | ||
// return correct results for Shim channels, however, for generality | ||
// towards other DMAs and future hardware generations, channel | ||
// assignment should happen before BD assignment. This requires more | ||
// refactoring. | ||
std::optional<uint32_t> bdId = generator.getAndAssignBdId(0); | ||
std::optional<uint32_t> bdId = generator.getAndAssignBdId( | ||
channel, BdIdAssignmentMode::Incremental); | ||
rewriter.setInsertionPointAfter(tileOp); | ||
auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), | ||
tileOp, bdId.value()); | ||
|
@@ -111,12 +114,8 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) { | |
return WalkResult::interrupt(); | ||
ChannelBdIdGenerator &generator = | ||
shimTileToGeneratorMap[tileOp.getResult()]; | ||
// TODO(jornt): Temporarily use channel 0 for all DMAs. This should | ||
// return correct results for Shim channels, however, for generality | ||
// towards other DMAs and future hardware generations, channel | ||
// assignment should happen before BD assignment. This requires more | ||
// refactoring. | ||
std::optional<uint32_t> bdId = generator.getAndAssignBdId(0); | ||
std::optional<uint32_t> bdId = generator.getAndAssignBdId( | ||
channel, BdIdAssignmentMode::Incremental); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reasoning behind using incremental mode?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since
```
%0 = dma_cpy_nd(bd_id=0)
dma_wait(%0)
%1 = dma_cpy_nd(bd_id=0)
dma_wait(%1)
...
```
using incremental mode, the goal is to have different ids which can be chained later
```
%0 = dma_cpy_nd(bd_id=0)
dma_wait(%0)
%1 = dma_cpy_nd(bd_id=1)
dma_wait(%1)
...
```
During the
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, yeah, I see. Otherwise, |
||
rewriter.setInsertionPointAfter(tileOp); | ||
auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), | ||
tileOp, bdId.value()); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be good to check that in the verifier.