diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
index c1bba31f1..fa45c468e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td
@@ -336,7 +336,7 @@ def AMDAIE_BdIdOp: AMDAIE_Op<"bd_id", [
 
   let arguments = (
     ins Index:$tile,
-        UI32Attr:$value
+        Index:$value
   );
 
   let assemblyFormat = [{ `(` $tile `,` $value `)` attr-dict }];
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
index bb31f7ec9..ad161a626 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir
@@ -3,11 +3,11 @@
 // CHECK-LABEL: func.func @bd_id
 // CHECK: %[[C0:.*]] = arith.constant 0 : index
 // CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], 0)
+// CHECK: %[[BD_ID:.*]] = amdaie.bd_id(%[[TILE_0]], %[[C0]])
 func.func @bd_id() {
   %c0 = arith.constant 0 : index
   %tile = amdaie.tile(%c0, %c0)
-  %bd_id = amdaie.bd_id(%tile, 0)
+  %bd_id = amdaie.bd_id(%tile, %c0)
   return
 }
 
@@ -295,7 +295,7 @@ func.func @npu_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32,
 // CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
 // CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 // CHECK-DAG:   %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:   %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
+// CHECK-DAG:   %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK-DAG:   %[[CONNECTION_0:.+]] = amdaie.connection
 // CHECK:       %{{.*}} = amdaie.npu.dma_cpy_nd async_source
 // CHECK-SAME:  %[[CONNECTION_0]]
@@ -308,7 +308,7 @@ func.func @npu_dma_cpy_nd_bd_id(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %tile = amdaie.tile(%c0, %c0)
-  %bd_id = amdaie.bd_id(%tile, 0)
+  %bd_id = amdaie.bd_id(%tile, %c0)
   %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %1 = amdaie.npu.dma_cpy_nd async_source %0([%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c128, %c16, %c1] bd_id = %bd_id, [%c0, %c0, %c0, %c0] [%c1, %c1, %c8, %c16] [%c128, %c16, %c16, %c1]  bd_id = %bd_id)
   return
@@ -371,7 +371,7 @@ func.func @npu_dma_cpy_nd_target_source(%arg0: !amdaie.logicalobjectfifo<memref<
 // CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
 // CHECK-DAG:   %[[C128:.+]] = arith.constant 128 : index
 // CHECK-DAG:   %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:   %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
+// CHECK-DAG:   %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK-DAG:   %[[CONNECTION_0:.+]] = amdaie.connection
 // CHECK:       %{{.*}} = amdaie.npu.dma_cpy_nd async_source %[[CONNECTION_0]]
 // CHECK-SAME:  %[[ARG0]][%[[C0]], %[[C0]], %[[C0]], %[[C0]]] [1, 1, %[[C8]], %[[C16]]] [%[[C128]], %[[C128]], %[[C16]], 1] bd_id = %[[BD_ID_0_0]]
@@ -383,7 +383,7 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1
   %c16 = arith.constant 16 : index
   %c128 = arith.constant 128 : index
   %tile = amdaie.tile(%c0, %c0)
-  %bd_id = amdaie.bd_id(%tile, 0)
+  %bd_id = amdaie.bd_id(%tile, %c0)
   %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
   %1 = amdaie.npu.dma_cpy_nd async_source %0(%arg0[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c128, %c16, 1] bd_id = %bd_id, %arg1[%c0, %c0, %c0, %c0] [1, 1, %c8, %c16] [%c128, %c16, %c16, 1] bd_id = %bd_id) : target_type = !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>  source_type = !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>
   return
@@ -396,14 +396,14 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo<memref<1
 // CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 // CHECK-DAG:   %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:   %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
+// CHECK-DAG:   %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK-DAG:   %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM)
 // CHECK-DAG:   %[[CONNECTION_0:.+]] = amdaie.connection
 func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo<memref<2048xi32>>, %arg1: !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %tile_0_0 = amdaie.tile(%c0, %c0)
-  %bd_id = amdaie.bd_id(%tile_0_0, 0)
+  %bd_id = amdaie.bd_id(%tile_0_0, %c0)
   %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM)
   %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<2048xi32>>, !amdaie.logicalobjectfifo<memref<2048xi32, 1 : i32>>)
 // CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo<memref<2048xi32>>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
index 1ba8d185d..29b4bdd2b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp
@@ -9,6 +9,8 @@
 #include "iree-amd-aie/Transforms/Passes.h"
 #include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
 #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
 
 #define DEBUG_TYPE "iree-amdaie-assign-npu-dma-bd-ids"
 
@@ -16,7 +18,161 @@ namespace mlir::iree_compiler::AMDAIE {
 
 namespace {
 
-/// Assign BD ids to NPU dma operations using the BD generator
+// Utility to retrieve the single TileOp of an NPU DMA op's logical objectFifo.
+// Target tiles override source tiles; each failed check emits an op error.
+FailureOr<AMDAIE::TileOp> getGeneratorTileOp(
+    AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
+    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) {
+  SmallVector<Value> tiles;
+  if (npuDmaOp.getSource()) {
+    auto logicalObjFifo =
+        dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
+            npuDmaOp.getSource().getDefiningOp());
+    if (!logicalObjFifo)
+      return npuDmaOp.emitOpError() << "expected a source logical objectFifo";
+    tiles = logicalObjFifo.getTiles();
+  }
+  if (npuDmaOp.getTarget()) {
+    auto logicalObjFifo =
+        dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+            npuDmaOp.getTarget().getDefiningOp());
+    if (!logicalObjFifo)
+      return npuDmaOp.emitOpError()
+             << "expected a target `amdaie.logicalobjectfifo.from_memref`";
+    tiles = logicalObjFifo.getTiles();
+  }
+  if (tiles.size() != 1) {
+    if (tiles.empty()) {
+      return npuDmaOp.emitOpError() << "no tiles found";
+    } else {
+      return npuDmaOp.emitOpError()
+             << "operating on multiple tiles is not supported";
+    }
+  }
+  Value tile = tiles[0];
+  if (!shimTileToGeneratorMap.contains(tile)) {
+    return npuDmaOp.emitOpError()
+           << "no channel BD ID generator found for tile: " << tile;
+  }
+  auto tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp());
+  if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found";
+  return tileOp;
+};
+
+// Returns true iff the closest `scf.for` enclosing `op` exists and contains no
+// other `scf.for`, i.e. the DMA lives directly in an innermost loop.
+bool isInMostInnerLoop(AMDAIE::NpuDmaCpyNdOp op) {
+  auto enclosingLoop = op->getParentOfType<scf::ForOp>();
+  // Not inside any loop at all.
+  if (!enclosingLoop) return false;
+  // The walk also visits `enclosingLoop` itself, so skip it and stop at the
+  // first loop found nested inside it.
+  WalkResult walkResult = enclosingLoop.walk([&](scf::ForOp candidate) {
+    if (candidate != enclosingLoop) return WalkResult::interrupt();
+    return WalkResult::advance();
+  });
+  return !walkResult.wasInterrupted();
+}
+
+// Counts the NPU DMA ops inside `loop` whose tile resolves to `tileOp`. Callers
+// use the count as the number of BD IDs consumed per loop iteration.
+uint32_t countBdIdPerLoopIteration(
+    scf::ForOp loop, AMDAIE::TileOp tileOp,
+    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap) {
+  uint32_t numBdIds = 0;
+  loop.walk([&](AMDAIE::NpuDmaCpyNdOp npuDmaOp) {
+    // Only DMAs that carry a source or a target operand are considered.
+    if (!npuDmaOp.getSource() && !npuDmaOp.getTarget()) return;
+    FailureOr<AMDAIE::TileOp> resolvedTile =
+        getGeneratorTileOp(npuDmaOp, shimTileToGeneratorMap);
+    // DMAs whose tile cannot be resolved, or is another tile, are not counted.
+    if (failed(resolvedTile) || *resolvedTile != tileOp) return;
+    ++numBdIds;
+  });
+  return numBdIds;
+}
+
+FailureOr<AMDAIE::BdIdOp> getBdIdOp(
+    IRRewriter &rewriter, AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
+    DenseMap<Value, ChannelBdIdGenerator> &shimTileToGeneratorMap,
+    DenseMap<TileOp, uint32_t> &tileToBdIdOffsetMap,
+    DenseMap<TileOp, uint32_t> &tileToBdIdSizeMap, uint32_t channel) {
+  FailureOr<AMDAIE::TileOp> tileOp =
+      getGeneratorTileOp(npuDmaOp, shimTileToGeneratorMap);
+  if (failed(tileOp)) return failure();
+
+  ChannelBdIdGenerator &generator = shimTileToGeneratorMap[tileOp->getResult()];
+  std::optional<uint32_t> bdId =
+      generator.getAndAssignBdId(channel, BdIdAssignmentMode::Incremental);
+  if (!bdId) return failure();
+  AMDAIE::BdIdOp bdIdOp;
+  rewriter.setInsertionPoint(npuDmaOp);
+  if (isInMostInnerLoop(npuDmaOp)) {
+    // In an innermost loop, express the BD ID as a semi-affine expression
+    // of the loop induction variable `iv`:
+    // `(iv * step + bdId - offset) % size + offset`, where
+    // `step` is the number of BD IDs needed per loop iteration,
+    // `bdId` is the BD ID just assigned by the generator,
+    // `offset` is the first BD ID recorded for this tile (cached in the map),
+    // `size` is the number of BD IDs that were available at that point.
+    if (!tileToBdIdOffsetMap.contains(*tileOp))
+      tileToBdIdOffsetMap[*tileOp] = bdId.value();
+    // Plus one because `bdId` was already taken from the generator above.
+    if (!tileToBdIdSizeMap.contains(*tileOp))
+      tileToBdIdSizeMap[*tileOp] = generator.getAvailableBdIdNum(channel) + 1;
+    auto loop = npuDmaOp->getParentOfType<scf::ForOp>();
+    uint32_t bdIdCount =
+        countBdIdPerLoopIteration(loop, *tileOp, shimTileToGeneratorMap);
+    Value iv = loop.getInductionVar();
+    auto step = rewriter.create<arith::ConstantOp>(
+        rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdIdCount));
+    auto diff = rewriter.create<arith::ConstantOp>(
+        rewriter.getUnknownLoc(),
+        rewriter.getIndexAttr(bdId.value() -
+                              tileToBdIdOffsetMap[*tileOp]));  // bdId - offset
+    auto offset = rewriter.create<arith::ConstantOp>(
+        rewriter.getUnknownLoc(),
+        rewriter.getIndexAttr(tileToBdIdOffsetMap[*tileOp]));
+    auto size = rewriter.create<arith::ConstantOp>(
+        rewriter.getUnknownLoc(),
+        rewriter.getIndexAttr(tileToBdIdSizeMap[*tileOp]));
+    auto mul = rewriter.create<arith::MulIOp>(rewriter.getUnknownLoc(), iv,
+                                              step);  // iv * step
+    auto add1 =
+        rewriter.create<arith::AddIOp>(rewriter.getUnknownLoc(), mul,
+                                       diff);  // iv * step + bdId - offset
+    auto mod = rewriter.create<arith::RemUIOp>(
+        rewriter.getUnknownLoc(), add1,
+        size);  // (iv * step + bdId - offset) % size
+    auto add2 = rewriter.create<arith::AddIOp>(
+        rewriter.getUnknownLoc(), mod,
+        offset);  // (iv * step + bdId - offset) % size + offset
+    bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), *tileOp,
+                                             add2.getResult());
+  } else {
+    // If the DMA is not in the innermost loop, assign a constant BD ID.
+    auto constant = rewriter.create<arith::ConstantOp>(
+        rewriter.getUnknownLoc(), rewriter.getIndexAttr(bdId.value()));
+    bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(), *tileOp,
+                                             constant.getResult());
+  }
+  return bdIdOp;
+};
+
+// Recover the first-iteration BD ID from `(... + diff) % size + offset`.
+FailureOr<uint32_t> retriveBdId(arith::AddIOp add2) {
+  // `getDefiningOp<T>()` is null-safe; block arguments have no defining op.
+  auto mod = add2.getOperand(0).getDefiningOp<arith::RemUIOp>();
+  if (!mod) return failure();
+  auto add1 = mod.getOperand(0).getDefiningOp<arith::AddIOp>();
+  if (!add1) return failure();
+  uint32_t offset = getConstantIndexOrAssert(add2.getOperand(1));
+  uint32_t diff = getConstantIndexOrAssert(add1.getOperand(1));
+  // `diff` holds `bdId - offset`, so adding `offset` back yields `bdId`.
+  return offset + diff;
+}
+
+/// Assign BD ids to NPU dma operations using the BD generator.
 LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
   IRRewriter rewriter(workgroupOp->getContext());
 
@@ -39,25 +195,15 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
     }
   });
 
-  // Utility to retrieve a TileOp from a vector of tile values, while doing
-  // appropriate verifications.
-  auto getGeneratorTileOp = [&](AMDAIE::NpuDmaCpyNdOp &npuDmaOp,
-                                const SmallVector<Value> &tiles,
-                                AMDAIE::TileOp &tileOp) -> LogicalResult {
-    if (tiles.size() != 1) {
-      return npuDmaOp.emitOpError()
-             << "operating on multiple tiles is not supported";
-    }
-    Value tile = tiles[0];
-    if (!shimTileToGeneratorMap.contains(tile)) {
-      return npuDmaOp.emitOpError()
-             << "no channel BD ID generator found for tile: " << tile;
-    }
-    tileOp = dyn_cast_if_present<AMDAIE::TileOp>(tile.getDefiningOp());
-    if (!tileOp) return npuDmaOp.emitOpError() << "no tile op found";
-    return success();
-  };
+  // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
+  // return correct results for Shim channels, however, for generality
+  // towards other DMAs and future hardware generations, channel
+  // assignment should happen before BD assignment. This requires more
+  // refactoring.
+  const uint32_t channel = 0;
 
+  DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdOffsetMap;
+  DenseMap<AMDAIE::TileOp, uint32_t> tileToBdIdSizeMap;
   // Walk `amdaie.npu_dma_cpy_nd` and  `amdaie.dma_wait` operations and assign
   // and release BD IDs when encountering the respective operations using the
   // tile BD ID generators initialized earlier.
@@ -65,28 +211,10 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
   WalkResult res = controlCodeOp->walk([&](Operation *op) {
     if (auto npuDmaOp = dyn_cast<AMDAIE::NpuDmaCpyNdOp>(op)) {
       if (npuDmaOp.getSource()) {
-        auto logicalObjFifo =
-            dyn_cast_if_present<AMDAIE::LogicalObjFifoOpInterface>(
-                npuDmaOp.getSource().getDefiningOp());
-        if (!logicalObjFifo) {
-          npuDmaOp.emitOpError() << "expected a source logical objectFifo";
-          return WalkResult::interrupt();
-        }
-        SmallVector<Value> tiles = logicalObjFifo.getTiles();
-        AMDAIE::TileOp tileOp;
-        if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp)))
-          return WalkResult::interrupt();
-        ChannelBdIdGenerator &generator =
-            shimTileToGeneratorMap[tileOp.getResult()];
-        // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
-        // return correct results for Shim channels, however, for generality
-        // towards other DMAs and future hardware generations, channel
-        // assignment should happen before BD assignemnt. This requires more
-        // refactoring.
-        std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
-        rewriter.setInsertionPointAfter(tileOp);
-        auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
-                                                      tileOp, bdId.value());
+        FailureOr<AMDAIE::BdIdOp> bdIdOp =
+            getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap,
+                      tileToBdIdOffsetMap, tileToBdIdSizeMap, channel);
+        if (failed(bdIdOp)) return WalkResult::interrupt();
         rewriter.setInsertionPoint(npuDmaOp);
         npuDmaOp = rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
             npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(),
@@ -94,38 +222,19 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
             npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(),
             npuDmaOp.getTargetBdId(), npuDmaOp.getSource(),
             npuDmaOp.getSourceMixedOffsets(), npuDmaOp.getSourceMixedSizes(),
-            npuDmaOp.getSourceMixedStrides(), bdIdOp);
+            npuDmaOp.getSourceMixedStrides(), *bdIdOp);
       }
       if (npuDmaOp.getTarget()) {
-        auto logicalObjFifo =
-            dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
-                npuDmaOp.getTarget().getDefiningOp());
-        if (!logicalObjFifo) {
-          npuDmaOp.emitOpError()
-              << "expected a target `amdaie.logicalobjectfifo.from_memref`";
-          return WalkResult::interrupt();
-        }
-        SmallVector<Value> tiles = logicalObjFifo.getTiles();
-        AMDAIE::TileOp tileOp;
-        if (failed(getGeneratorTileOp(npuDmaOp, tiles, tileOp)))
-          return WalkResult::interrupt();
-        ChannelBdIdGenerator &generator =
-            shimTileToGeneratorMap[tileOp.getResult()];
-        // TODO(jornt): Temporarily use channel 0 for all DMAs. This should
-        // return correct results for Shim channels, however, for generality
-        // towards other DMAs and future hardware generations, channel
-        // assignment should happen before BD assignemnt. This requires more
-        // refactoring.
-        std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
-        rewriter.setInsertionPointAfter(tileOp);
-        auto bdIdOp = rewriter.create<AMDAIE::BdIdOp>(rewriter.getUnknownLoc(),
-                                                      tileOp, bdId.value());
+        FailureOr<AMDAIE::BdIdOp> bdIdOp =
+            getBdIdOp(rewriter, npuDmaOp, shimTileToGeneratorMap,
+                      tileToBdIdOffsetMap, tileToBdIdSizeMap, channel);
+        if (failed(bdIdOp)) return WalkResult::interrupt();
         rewriter.setInsertionPoint(npuDmaOp);
         (void)rewriter.replaceOpWithNewOp<AMDAIE::NpuDmaCpyNdOp>(
             npuDmaOp, npuDmaOp.getResultTypes(), npuDmaOp.getConnection(),
             npuDmaOp.getTarget(), npuDmaOp.getTargetMixedOffsets(),
             npuDmaOp.getTargetMixedSizes(), npuDmaOp.getTargetMixedStrides(),
-            bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(),
+            *bdIdOp, npuDmaOp.getSource(), npuDmaOp.getSourceMixedOffsets(),
             npuDmaOp.getSourceMixedSizes(), npuDmaOp.getSourceMixedStrides(),
             npuDmaOp.getSourceBdId());
       }
@@ -158,8 +267,18 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
         }
         ChannelBdIdGenerator &generator =
             shimTileToGeneratorMap[tileOp.getResult()];
-        uint32_t value = bdIdOp.getValue();
-        generator.releaseBdId(value);
+        Value value = bdIdOp.getValue();
+        if (auto addOp = value.getDefiningOp<arith::AddIOp>()) {
+          // If the BD ID is a semi-affine expression, retrieve the BD ID for
+          // the first iteration.
+          FailureOr<uint32_t> bdId = retriveBdId(addOp);
+          if (failed(bdId)) return WalkResult::interrupt();
+          generator.releaseBdId(*bdId);
+        } else {
+          // Else, must be a constant BD ID.
+          uint32_t bdId = getConstantIndexOrAssert(value);
+          generator.releaseBdId(bdId);
+        }
       }
       return WalkResult::advance();
     }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
index 2348478d9..8fdd11f56 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp
@@ -136,7 +136,7 @@ struct HalfDmaCpyNdToNpuConverter final
       return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
     int64_t col = getConstantIndexOrAssert(tileOp.getCol());
     int64_t row = getConstantIndexOrAssert(tileOp.getRow());
-    int32_t bdId = bdIdOp.getValue();
+    int32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue());
     int32_t outOfOrderId{0};
 
     SmallVector<int32_t, 4> staticSizes;
@@ -159,22 +159,18 @@ struct HalfDmaCpyNdToNpuConverter final
         if (stride == 0) {
           repeatCount = size;
         } else {
-          iterationStride =
-              std::max(stride * elemWidthInBits / minStrideBitWidth,
-                       (int64_t)1);
+          iterationStride = std::max(
+              stride * elemWidthInBits / minStrideBitWidth, (int64_t)1);
           iterationSize = size;
-          if (stride == 1)
-            size = (size * elemWidthInBits) / minStrideBitWidth;
+          if (stride == 1) size = (size * elemWidthInBits) / minStrideBitWidth;
           repeatCount = iterationSize;
         }
       } else {
         staticStrides.push_back(
-            std::max(stride * elemWidthInBits / minStrideBitWidth,
-                     (int64_t)1));
+            std::max(stride * elemWidthInBits / minStrideBitWidth, (int64_t)1));
         // Innermost size needs to account for addressing granularity.
         if (iter.index() == (sizes.size() - 1)) {
-          staticSizes.push_back(size * elemWidthInBits /
-                                minStrideBitWidth);
+          staticSizes.push_back(size * elemWidthInBits / minStrideBitWidth);
         } else {
           staticSizes.push_back(size);
         }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
index b482663de..b4d0aabe0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_npu_dma_bd_ids.mlir
@@ -15,10 +15,10 @@ module {
 // CHECK:       %[[C0:.+]] = arith.constant 0 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
 // CHECK-DAG:     %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_source_token)
 #map = affine_map<(d0) -> (d0 * 16)>
@@ -50,10 +50,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:       %[[C0:.+]] = arith.constant 0 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
 // CHECK-DAG:     %[[FROM_MEMREF:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA:.+]] = amdaie.npu.dma_cpy_nd async_target %[[CIRC_DMA]](%[[FROM_MEMREF]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]], [] [] [])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA]] : !amdaie.async_target_token)
 #map = affine_map<(d0) -> (d0 * 16)>
@@ -87,11 +87,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:       %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
 // CHECK-DAG:     %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0)
 // CHECK-DAG:     %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0)
 // CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
@@ -99,8 +96,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:         %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
@@ -143,24 +143,28 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK-LABEL: @multiple_dma_cpy_with_bd_id_reuse
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
+// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id_1
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
 // CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
-// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
+// CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
-// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]])
+// CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 #map = affine_map<(d0) -> (d0 * 16)>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @multiple_dma_cpy_with_bd_id_reuse(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) {
+  func.func @multiple_dma_cpy_with_diff_bd_id_1(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     amdaie.workgroup {
@@ -186,18 +190,20 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
+// CHECK-LABEL: @multiple_dma_cpy_with_diff_bd_id_2
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 // CHECK:       amdaie.workgroup
 // CHECK:         %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
-// CHECK-DAG:     %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1)
-// CHECK-DAG:     %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], 2)
 // CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK:         %[[CIRC_DMA:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
 // CHECK:           %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0, 0] [8, 16] [16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:           %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C2]])
 // CHECK:           %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_2]])
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
 // CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
@@ -205,7 +211,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 #map = affine_map<(d0) -> (d0 * 16)>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @multiple_dma_cpy_with_diff_bd_id(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) {
+  func.func @multiple_dma_cpy_with_diff_bd_id_2(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     amdaie.workgroup {
@@ -231,20 +237,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 
 // -----
 
-// CHECK-LABEL: @nested_loops
-// CHECK:       %[[C0:.+]] = arith.constant 0 : index
-// CHECK:       %[[C1:.+]] = arith.constant 1 : index
-// CHECK:       %[[C2:.+]] = arith.constant 2 : index
-// CHECK:       %[[C6:.+]] = arith.constant 6 : index
+// CHECK-LABEL: @nested_loops_1
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C15:.+]] = arith.constant 15 : index
+// CHECK-DAG:   %[[C16:.+]] = arith.constant 16 : index
 // CHECK:       amdaie.workgroup
 // CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0)
-// CHECK-DAG:     %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], 1)
 // CHECK-DAG:     %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], 0)
-// CHECK-DAG:     %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], 1)
 // CHECK-DAG:     %[[TILE_2_0:.+]] = amdaie.tile(%[[C2]], %[[C0]])
-// CHECK-DAG:     %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], 0)
 // CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_1_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
 // CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_2_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
@@ -252,14 +255,22 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK:         %[[CIRC_DMA_1:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         %[[CIRC_DMA_2:.+]] = amdaie.circular_dma_cpy_nd
 // CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0_0]])
 // CHECK:           scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK:             %[[BD_ID_1_0:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK:             %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1_0]])
-// CHECK:             scf.for %{{.+}} = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK:             scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK:               %[[VAR_0:.+]] = arith.remui %[[LOOP_VAR_0]], %[[C15]] : index
+// CHECK:               %[[VAR_1:.+]] = arith.addi %[[VAR_0]], %[[C1]] : index
+// CHECK:               %[[BD_ID_1_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[VAR_1]])
 // CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_1]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_1_1]])
+// CHECK:               %[[BD_ID_0_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_1]])
 // CHECK:               %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_0_1]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token)
+// CHECK:               %[[VAR_2:.+]] = arith.remui %[[LOOP_VAR_0]], %[[C16]] : index
+// CHECK:               %[[BD_ID_2_0:.+]] = amdaie.bd_id(%[[TILE_2_0]], %[[VAR_2]])
 // CHECK:               %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_2]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_2_0]])
 // CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token)
 // CHECK:             }
@@ -269,7 +280,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 #map = affine_map<(d0) -> (d0 * 16)>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @nested_loops(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) {
+  func.func @nested_loops_1(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
@@ -310,3 +321,85 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     return
   }
 }
+
+// -----
+
+// CHECK-LABEL: @nested_loops_2
+// CHECK-DAG:   %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
+// CHECK-DAG:   %[[C3:.+]] = arith.constant 3 : index
+// CHECK-DAG:   %[[C6:.+]] = arith.constant 6 : index
+// CHECK-DAG:   %[[C14:.+]] = arith.constant 14 : index
+// CHECK:       amdaie.workgroup
+// CHECK-DAG:     %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
+// CHECK-DAG:     %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+// CHECK-DAG:     %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+// CHECK-DAG:     %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {%[[TILE_0_0]]} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+// CHECK:         %[[CIRC_DMA_0:.+]] = amdaie.circular_dma_cpy_nd
+// CHECK:         amdaie.controlcode
+// CHECK:           %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
+// CHECK:           %[[NPU_DMA_0:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1] bd_id = %[[BD_ID_0]])
+// CHECK:           scf.forall (%{{.+}}, %{{.+}}) in (2, 2)
+// CHECK:             %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C1]])
+// CHECK:             %[[NPU_DMA_1:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_1]][0, 0, 0] [1, 8, 16] [128, 16, 1] bd_id = %[[BD_ID_1]])
+// CHECK:             scf.for %[[LOOP_VAR_0:.+]] = %[[C0]] to %[[C6]] step %[[C1]]
+// CHECK:               %[[VAR_0:.+]] = arith.muli %[[LOOP_VAR_0]], %[[C3]] : index
+// CHECK:               %[[VAR_1:.+]] = arith.remui %[[VAR_0]], %[[C14]] : index
+// CHECK:               %[[VAR_2:.+]] = arith.addi %[[VAR_1]], %[[C2]] : index
+// CHECK:               %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_2]])
+// CHECK:               %[[NPU_DMA_2:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_1]][0, 0] [1, 128] [128, 1] bd_id = %[[BD_ID_2]])
+// CHECK:               %[[VAR_3:.+]] = arith.addi %[[VAR_0]], %[[C1]] : index
+// CHECK:               %[[VAR_4:.+]] = arith.remui %[[VAR_3]], %[[C14]] : index
+// CHECK:               %[[VAR_5:.+]] = arith.addi %[[VAR_4]], %[[C2]] : index
+// CHECK:               %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_5]])
+// CHECK:               %[[NPU_DMA_3:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_0]][0] [128] [1] bd_id = %[[BD_ID_3]])
+// CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_2]] : !amdaie.async_source_token)
+// CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_3]] : !amdaie.async_source_token)
+// CHECK:               %[[VAR_6:.+]] = arith.addi %[[VAR_0]], %[[C2]] : index
+// CHECK:               %[[VAR_7:.+]] = arith.remui %[[VAR_6]], %[[C14]] : index
+// CHECK:               %[[VAR_8:.+]] = arith.addi %[[VAR_7]], %[[C2]] : index
+// CHECK:               %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[VAR_8]])
+// CHECK:               %[[NPU_DMA_4:.+]] = amdaie.npu.dma_cpy_nd async_source %[[CIRC_DMA_0]]([] [] [], %[[FROM_MEMREF_2]][] [] [] bd_id = %[[BD_ID_4]])
+// CHECK:               amdaie.npu.dma_wait(%[[NPU_DMA_4]] : !amdaie.async_source_token)
+// CHECK:             }
+// CHECK:             amdaie.npu.dma_wait(%[[NPU_DMA_1]] : !amdaie.async_source_token)
+// CHECK:           }
+// CHECK:           amdaie.npu.dma_wait(%[[NPU_DMA_0]] : !amdaie.async_source_token)
+#map = affine_map<(d0) -> (d0 * 16)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+  func.func @nested_loops_2(%arg0: memref<8x16xi32>, %arg1: memref<8x16xi32>, %arg2: memref<8x16xi32>, %arg3: memref<1x1x8x16xi32, 1>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c6 = arith.constant 6 : index
+    amdaie.workgroup {
+      %tile_0_0 = amdaie.tile(%c0, %c0)
+      %placeholder0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %from_memref_0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %from_memref_1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %from_memref_2 = amdaie.logicalobjectfifo.from_memref %arg2, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo<memref<8x16xi32>>
+      %from_memref_3 = amdaie.logicalobjectfifo.from_memref %arg3, {%tile_0_0} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
+      %dma0 = amdaie.circular_dma_cpy_nd(%from_memref_3[] [] [], %placeholder0[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32>>)
+      amdaie.controlcode {
+        %0 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+        scf.forall (%arg4, %arg5) in (2, 2) {
+          %1 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_1[0, 0, 0] [1, 8, 16] [128, 16, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+          scf.for %arg6 = %c0 to %c6 step %c1 {
+            %2 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_1[0, 0] [1, 128] [128, 1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            %3 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_0[0] [128] [1]) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            amdaie.npu.dma_wait(%2 : !amdaie.async_source_token)
+            amdaie.npu.dma_wait(%3 : !amdaie.async_source_token)
+            %4 = amdaie.npu.dma_cpy_nd async_source %dma0([] [] [], %from_memref_2[] [] []) : source_type = !amdaie.logicalobjectfifo<memref<8x16xi32>>
+            amdaie.npu.dma_wait(%4 : !amdaie.async_source_token)
+          }
+          amdaie.npu.dma_wait(%1 : !amdaie.async_source_token)
+        }
+        amdaie.npu.dma_wait(%0 : !amdaie.async_source_token)
+        amdaie.end
+      }
+    }
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
index e15cabc27..db465c0f0 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir
@@ -40,7 +40,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -55,6 +54,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
@@ -91,7 +91,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -106,6 +105,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xbf16>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
@@ -142,7 +142,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -157,6 +156,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue  {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
@@ -193,7 +193,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -208,6 +207,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi8>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue  {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
@@ -243,7 +243,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -258,6 +257,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xi32>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
@@ -294,7 +294,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
     amdaie.workgroup {
       %tile = amdaie.tile(%c0, %c1)
       %tile_0 = amdaie.tile(%c0, %c0)
-      %bd_id = amdaie.bd_id(%tile_0, 0)
       %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32>
       %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32>
       %lock = amdaie.lock(%tile(4), 4)
@@ -309,6 +308,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       amdaie.controlcode {
         %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo<memref<2048xbf16>>
         memref.assume_alignment %1, 64 : memref<64x32xi32>
+        %bd_id = amdaie.bd_id(%tile_0, %c0)
 // CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 0, 0, 0>, strides = array<i32: 0, 0, 0>, use_next_bd = false, valid_bd = true}
 // CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32}
 // CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
index d52696980..d570d2624 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/dma_loop_subsumption.mlir
@@ -31,7 +31,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %1 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
       amdaie.controlcode {
         scf.for %arg2 = %c1 to %c6 step %c2 {
-          %bd_id = amdaie.bd_id(%tile, 0)
+          %bd_id = amdaie.bd_id(%tile, %c0)
           amdaie.npu.dma_cpy_nd %0([%arg2] [16] [1] bd_id = %bd_id, [] [] [])
           amdaie.npu.dma_cpy_nd %1([%arg2] [16] [1], [] [] [])
         }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
index a036bcb5f..9c4e7a1d4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir
@@ -874,7 +874,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %tile_0 = amdaie.tile(%c0, %c1)
       %tile_1 = amdaie.tile(%c0, %c2)
       %tile_2 = amdaie.tile(%c1, %c2)
-      %bd_id = amdaie.bd_id(%tile, 0)
       %buffer = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32>
       %buffer_3 = amdaie.buffer(%tile_0) : memref<4096xi32, 1 : i32>
       %lock = amdaie.lock(%tile_0(0), 2)
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp
index 339146e5f..de8373916 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp
@@ -9,19 +9,39 @@
 namespace mlir::iree_compiler::AMDAIE {
 
 std::optional<uint32_t> ChannelBdIdGenerator::getAndAssignBdId(
-    uint32_t channel) {
+    uint32_t channel, BdIdAssignmentMode mode) {
   if (!channelToValidBdIds.contains(channel) ||
       channelToValidBdIds[channel].empty()) {
     return std::nullopt;
   }
-  uint32_t bdId = channelToValidBdIds[channel][0];
-  size_t index{1};
-  while (isBdIdAssigned(bdId) && index < channelToValidBdIds[channel].size()) {
-    bdId = channelToValidBdIds[channel][index++];
+
+  if (mode == BdIdAssignmentMode::Smallest) {
+    // Smallest: pick the first unassigned BD id in the channel's list order.
+    for (uint32_t bdId : channelToValidBdIds[channel]) {
+      if (!isBdIdAssigned(bdId)) {
+        assignBdId(bdId);
+        return bdId;
+      }
+    }
+  } else {
+    // Incremental: find the first unassigned BD id greater than lastUsedBdId.
+    for (uint32_t bdId : channelToValidBdIds[channel]) {
+      if (bdId > lastUsedBdId && !isBdIdAssigned(bdId)) {
+        assignBdId(bdId);
+        return bdId;
+      }
+    }
+    // If none is found, wrap around and scan the ids <= lastUsedBdId.
+    for (uint32_t bdId : channelToValidBdIds[channel]) {
+      if (bdId <= lastUsedBdId && !isBdIdAssigned(bdId)) {
+        assignBdId(bdId);
+        return bdId;
+      }
+    }
   }
-  if (isBdIdAssigned(bdId)) return std::nullopt;
-  assignBdId(bdId);
-  return bdId;
+
+  // No unassigned BD id is available for this channel.
+  return std::nullopt;
 }
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h
index e478ac155..bd18d9c8e 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h
@@ -16,6 +16,11 @@ using namespace llvm;
 
 namespace mlir::iree_compiler::AMDAIE {
 
+enum class BdIdAssignmentMode {
+  Incremental,  // Next free id after the last assigned one, wrapping around
+  Smallest      // First free id in the channel's list of valid ids
+};
+
 /// Utility to generate valid buffer descriptor (BD) ids for channels. Keeps
 /// state on assigned BD ids to avoid reuse.
 class ChannelBdIdGenerator {
@@ -28,11 +33,15 @@ class ChannelBdIdGenerator {
       DenseMap<uint32_t, SmallVector<uint32_t>> &&channelToValidBdIds)
       : channelToValidBdIds(std::move(channelToValidBdIds)) {}
 
-  void assignBdId(uint32_t bdId) { assignedBdIds.insert(bdId); }
+  void assignBdId(uint32_t bdId) {
+    assignedBdIds.insert(bdId);
+    lastUsedBdId = bdId;
+  }
 
   /// Attempts to find and assign an unused BD id for the provided channel.
   /// Returns `std::nullopt` if no valid BD id could be found.
-  std::optional<uint32_t> getAndAssignBdId(uint32_t channel);
+  std::optional<uint32_t> getAndAssignBdId(
+      uint32_t channel, BdIdAssignmentMode mode = BdIdAssignmentMode::Smallest);
 
   /// Check whether the provided BD id is currently assigned.
   bool isBdIdAssigned(uint32_t bdId) const { return assignedBdIds.count(bdId); }
@@ -41,11 +50,25 @@ class ChannelBdIdGenerator {
   /// reused.
   void releaseBdId(uint32_t bdId) { assignedBdIds.erase(bdId); }
 
+  /// Returns how many of the valid BD ids for `channel` are currently
+  /// unassigned; returns 0 if `channel` has no entry in the valid-id map.
+  uint32_t getAvailableBdIdNum(uint32_t channel) const {
+    auto it = channelToValidBdIds.find(channel);
+    if (it == channelToValidBdIds.end()) return 0;
+    uint32_t count = 0;
+    for (uint32_t bdId : it->second) {
+      if (!isBdIdAssigned(bdId)) ++count;
+    }
+    return count;
+  }
+
  private:
   // Maps channel indices to vectors of valid BD ids.
   DenseMap<uint32_t, SmallVector<uint32_t>> channelToValidBdIds;
   // Set with all BD ids that are currently assigned.
   DenseSet<uint32_t> assignedBdIds;
+  // Most recently assigned BD id; Incremental mode searches upward from it
+  uint32_t lastUsedBdId = std::numeric_limits<uint32_t>::max();
 };
 
 }  // namespace mlir::iree_compiler::AMDAIE
diff --git a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp
index 3b05cf19d..3ae8530d9 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp
+++ b/runtime/src/iree-amd-aie/aie_runtime/Utils/test/ChannelBdIdGeneratorTest.cpp
@@ -4,19 +4,15 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-
 #include <numeric>
 
 #include "gtest/gtest.h"
 #include "iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h"
 
-
 namespace {
 
-
 using namespace mlir::iree_compiler::AMDAIE;
 
-
 DenseMap<uint32_t, SmallVector<uint32_t>>
 getTestSingleRangeChannelToValidBdIds() {
   SmallVector<uint32_t> range(3);
@@ -26,7 +22,6 @@ getTestSingleRangeChannelToValidBdIds() {
   return channelToValidBdIds;
 }
 
-
 DenseMap<uint32_t, SmallVector<uint32_t>> getTestEvenOddChannelToValidBdIds() {
   SmallVector<uint32_t> evenRange(4);
   std::iota(evenRange.begin(), evenRange.end(), 0);
@@ -38,7 +33,6 @@ DenseMap<uint32_t, SmallVector<uint32_t>> getTestEvenOddChannelToValidBdIds() {
   return channelToValidBdIds;
 }
 
-
 TEST(ChannelBdIdGeneratorTest, SingleRange) {
   ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
   EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
@@ -50,7 +44,6 @@ TEST(ChannelBdIdGeneratorTest, SingleRange) {
   EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt);
 }
 
-
 TEST(ChannelBdIdGeneratorTest, EvenOdd) {
   ChannelBdIdGenerator generator(getTestEvenOddChannelToValidBdIds());
   // Check that even channel BDs start from 0
@@ -77,7 +70,6 @@ TEST(ChannelBdIdGeneratorTest, EvenOdd) {
   EXPECT_EQ(generator.getAndAssignBdId(3), std::nullopt);
 }
 
-
 TEST(ChannelBdIdGeneratorTest, AssignBdId) {
   ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
   generator.assignBdId(0);
@@ -87,7 +79,6 @@ TEST(ChannelBdIdGeneratorTest, AssignBdId) {
   EXPECT_EQ(generator.getAndAssignBdId(1), std::nullopt);
 }
 
-
 TEST(ChannelBdIdGeneratorTest, Release) {
   ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
   EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
@@ -102,10 +93,26 @@ TEST(ChannelBdIdGeneratorTest, Release) {
   EXPECT_EQ(generator.isBdIdAssigned(1), true);
 }
 
+// Incremental mode resumes after the last assigned id, even once released.
+TEST(ChannelBdIdGeneratorTest, IncrementalAssign) {
+  ChannelBdIdGenerator generator(getTestSingleRangeChannelToValidBdIds());
+  const auto incremental = BdIdAssignmentMode::Incremental;
+  EXPECT_EQ(generator.getAndAssignBdId(0, incremental).value(), 0);
+  generator.releaseBdId(0);
+  EXPECT_EQ(generator.getAndAssignBdId(0, incremental).value(), 1);
+  generator.releaseBdId(1);
+  EXPECT_EQ(generator.getAndAssignBdId(0, incremental).value(), 2);
+  generator.releaseBdId(2);
+  // No free id above 2 is found, so the search wraps around to id 0.
+  EXPECT_EQ(generator.getAndAssignBdId(0, incremental).value(), 0);
+  generator.releaseBdId(0);
+  // The default mode (Smallest) returns the lowest free id.
+  EXPECT_EQ(generator.getAndAssignBdId(0).value(), 0);
+  generator.releaseBdId(0);
+}
 
 }  // namespace
 
-
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();