nod-ai · Yu-Zhewen · Nov 20, 2024 · Nov 25, 2024 · Nov 25, 2024 · Nov 25, 2024
@@ -1101,15 +1101,16 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               Value input, ArrayRef<OpFoldResult> offsets,
                               ArrayRef<OpFoldResult> sizes,
                               ArrayRef<OpFoldResult> strides, Value bdId,
-                              Value channel) {
+                              Value channel, bool useNextBd, Value nextBd,
+                              Value startBd) {
   SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
   SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
   dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
   dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes);
   dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides);
   build(b, result, resultTypes, connection, input, dynamicOffsets, dynamicSizes,
         dynamicStrides, staticOffsets, staticSizes, staticStrides, bdId,
-        channel);
+        channel, useNextBd, nextBd, startBd);
 }
 
 // Build a NpuHalfDmaCpyNdOp with static entries.
@@ -1118,7 +1119,8 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               Value input, ArrayRef<int64_t> offsets,
                               ArrayRef<int64_t> sizes,
                               ArrayRef<int64_t> strides, mlir::Value bdId,
-                              Value channel) {
+                              Value channel, bool useNextBd, Value nextBd,
+                              Value startBd) {
   SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(llvm::map_range(
       offsets,
       [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
@@ -1130,23 +1132,24 @@ void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
       strides,
       [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
   build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
-        strideValues, bdId, channel);
+        strideValues, bdId, channel, useNextBd, nextBd, startBd);
 }
 
 // Build a NpuHalfDmaCpyNdOp with dynamic entries.
 void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
                               TypeRange resultTypes, Value connection,
                               Value input, ValueRange offsets, ValueRange sizes,
                               ValueRange strides, mlir::Value bdId,
-                              Value channel) {
+                              Value channel, bool useNextBd, Value nextBd,
+                              Value startBd) {
   SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
       llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
   SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
       llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
   SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
       llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
   build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
-        strideValues, bdId, channel);
+        strideValues, bdId, channel, useNextBd, nextBd, startBd);
 }
 
 std::optional<int64_t> NpuHalfDmaCpyNdOp::getStaticBaseOffset() {

@@ -591,6 +591,14 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     ShapedType::kDynamic encodes that the corresponding entry has a dynamic
     value.
 
+    It also supports the representation of DMA BD chaining using the `use_next_bd`
+    attribute and the `next_bd` and `start_bd` operands. The `use_next_bd` attribute
+    indicates whether another DMA operation is chained to follow this one.
+    If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of
+    the next DMA operation in the chain. Within a chain, the `start_bd` operand
+    identifies the BD ID of the first DMA operation in the sequence.
+    When `use_next_bd` is `false`, `start_bd` is set to the same value as `bd_id`.
+
     Example:
 
     ```mlir
@@ -604,7 +612,7 @@ def AMDAIE_NpuHalfDmaCpyNdOp
       %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} 
         : memref<32x1024xi32> -> !amdaie.logicalobjectfifo<memref<32768xi32>>
       %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1]
-        bd_id = %bd_id channel = %channel)
+        bd_id = %bd_id channel = %channel use_next_bd = false start_bd = %bd_id)
       ...
     }
     ```
@@ -620,7 +628,10 @@ def AMDAIE_NpuHalfDmaCpyNdOp
         DenseI64ArrayAttr:$static_sizes,
         DenseI64ArrayAttr:$static_strides,
         Optional<Index>:$bd_id,
-        Optional<Index>:$channel
+        Optional<Index>:$channel,
+        BoolAttr:$use_next_bd,
+        Optional<Index>:$next_bd,
+        Optional<Index>:$start_bd
   );
 
   let results = (outs Optional<AMDAIE_AsyncTokenType>:$async_token);
@@ -635,6 +646,9 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     custom<DynamicIndexList>($strides, $static_strides)
     (`bd_id` `=` $bd_id^)?
     (`channel` `=` $channel^)?
+    `use_next_bd` `=` $use_next_bd
+    (`next_bd` `=` $next_bd^)?
+    (`start_bd` `=` $start_bd^)?
     `)`
     attr-dict 
     `:` type($input)
@@ -645,16 +659,19 @@ def AMDAIE_NpuHalfDmaCpyNdOp
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$input, "ArrayRef<OpFoldResult>":$offsets,
       "ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
-      "::mlir::Value":$bd_id, "::mlir::Value":$channel)>,
+      "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>,
     // Build a NpuHalfDmaCpyNdOp with static entries.
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$target, "ArrayRef<int64_t>":$offsets,
       "ArrayRef<int64_t>":$sizes, "ArrayRef<int64_t>":$strides,
-      "::mlir::Value":$bd_id, "::mlir::Value":$channel)>,
+      "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>,
     // Build a NpuHalfDmaCpyNdOp with dynamic entries.
     OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection,
       "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes,
-      "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)>
+      "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel,
+      "bool":$use_next_bd, "::mlir::Value":$next_bd, "::mlir::Value":$start_bd)>,
   ];
 
   let extraClassDeclaration = [{
@@ -673,6 +690,16 @@ def AMDAIE_NpuHalfDmaCpyNdOp
       return dyn_cast_if_present<BdIdOp>(getBdId().getDefiningOp());
     }
 
+    std::optional<BdIdOp> getNextBdIdOp() {
+      if (!getNextBd()) return std::nullopt;
+      return dyn_cast_if_present<BdIdOp>(getNextBd().getDefiningOp());
+    }
+
+    std::optional<BdIdOp> getStartBdIdOp() {
+      if (!getStartBd()) return std::nullopt;
+      return dyn_cast_if_present<BdIdOp>(getStartBd().getDefiningOp());
+    }
+
     // Return the input `amdaie.connection` operation.
     std::optional<ConnectionOp> getConnectionOp() {
       return dyn_cast_if_present<ConnectionOp>(getConnection().getDefiningOp());

@@ -406,16 +406,16 @@ func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>
   %bd_id = amdaie.bd_id(%tile_0_0, 0)
   %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
   %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>)
-// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-  amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-  amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-  amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo<memref<2048xi32>>
-  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] [] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
   return
 }
 

@@ -1,6 +1,6 @@
 // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run.
 
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-tiles,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-to-half-dma-cpy-nd,iree-amdaie-controlcode-to-npu,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s
 
 
 

@@ -58,6 +58,13 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
     return success();
   };
 
+  // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
+  // return correct results for Shim channels, however, for generality
+  // towards other DMAs and future hardware generations, channel
+  // assignment should happen before BD assignment. This requires more
+  // refactoring.
+  const uint32_t channel = 0;
+
   // Walk `amdaie.npu_dma_cpy_nd` and  `amdaie.dma_wait` operations and assign
   // and release BD IDs when encountering the respective operations using the
   // tile BD ID generators initialized earlier.
@@ -78,12 +85,8 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
           return WalkResult::interrupt();
         ChannelBdIdGenerator &generator =
             shimTileToGeneratorMap[tileOp.getResult()];
-        // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
-        // return correct results for Shim channels, however, for generality
-        // towards other DMAs and future hardware generations, channel
-        // assignment should happen before BD assignemnt. This requires more
-        // refactoring.
-        std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
+        std::optional<uint32_t> bdId = generator.getAndAssignBdId(
+            channel, BdIdAssignmentMode::Incremental);
         rewriter.setInsertionPointAfter(tileOp);
         auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
                                                       tileOp, bdId.value());
@@ -111,12 +114,8 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
           return WalkResult::interrupt();
         ChannelBdIdGenerator &generator =
             shimTileToGeneratorMap[tileOp.getResult()];
-        // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
-        // return correct results for Shim channels, however, for generality
-        // towards other DMAs and future hardware generations, channel
-        // assignment should happen before BD assignemnt. This requires more
-        // refactoring.
-        std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
+        std::optional<uint32_t> bdId = generator.getAndAssignBdId(
+            channel, BdIdAssignmentMode::Incremental);
         rewriter.setInsertionPointAfter(tileOp);
         auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
                                                       tileOp, bdId.value());