From bba961e658c2666cf653379f0bbf53451facffde Mon Sep 17 00:00:00 2001
From: yzhang93
Date: Fri, 7 Feb 2025 17:51:28 -0800
Subject: [PATCH] [LoweringStrategy] Use a more general method to fetch input dims and sizes

---
 .../Transforms/KernelDispatch.cpp           | 204 +++++++++++-------
 .../test/lowering_strategy_air.mlir         |   6 +-
 .../test/lowering_strategy_generic.mlir     |  36 +++-
 .../lowering_strategy_objectfifo_npu1.mlir  |  48 ++---
 .../lowering_strategy_objectfifo_npu4.mlir  |   4 +-
 5 files changed, 191 insertions(+), 107 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
index ff795fca0..fa4bccc6f 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp
@@ -84,6 +84,52 @@ FailureOr<SmallVector<int64_t>> getPackedSize(linalg::LinalgOp linalgOp,
   return instructionSize;
 }
 
+struct InputDimsAndSizes {
+  SmallVector<unsigned, 2> mDims;
+  SmallVector<unsigned, 2> nDims;
+  SmallVector<unsigned, 2> kDims;
+  SmallVector<int64_t> mSizes;
+  SmallVector<int64_t> nSizes;
+  SmallVector<int64_t> kSizes;
+};
+
+FailureOr<InputDimsAndSizes> getInputDimsAndSizes(linalg::LinalgOp linalgOp) {
+  FailureOr<linalg::ContractionDimensions> maybeContractionDims =
+      linalg::inferContractionDims(linalgOp);
+  if (failed(maybeContractionDims)) {
+    return linalgOp.emitOpError("failed to infer the contraction dimensions.");
+  }
+
+  linalg::ContractionDimensions contractionDims = *maybeContractionDims;
+  SmallVector<unsigned, 2> mDims = contractionDims.m;
+  SmallVector<unsigned, 2> nDims = contractionDims.n;
+  SmallVector<unsigned, 2> kDims = contractionDims.k;
+  if (mDims.empty() || nDims.empty() || kDims.empty()) {
+    return linalgOp.emitOpError("failed to fetch m/n/k dims.");
+  }
+
+  SmallVector<int64_t> shapes = linalgOp.getStaticLoopRanges();
+  if (mDims.size() + nDims.size() + kDims.size() > shapes.size()) {
+    return linalgOp.emitOpError(
+        "the total of m/n/k dims is larger than the number of loops.");
+  }
+
+  auto getSizesAt = [&shapes](const SmallVector<unsigned, 2> &idx) {
+    SmallVector<int64_t> sizes;
+    for (auto i : idx) sizes.push_back(shapes[i]);
+    return sizes;
+  };
+
+  InputDimsAndSizes inputDimsAndSizes;
+  inputDimsAndSizes.mDims = mDims;
+  inputDimsAndSizes.nDims = nDims;
+  inputDimsAndSizes.kDims = kDims;
+  inputDimsAndSizes.mSizes = getSizesAt(mDims);
+  inputDimsAndSizes.nSizes = getSizesAt(nDims);
+  inputDimsAndSizes.kSizes = getSizesAt(kDims);
+  return inputDimsAndSizes;
+}
+
 // Container class for the tiling at level 0 (the AIE shared memory) and level 1
 // (the AIE core) in the M-, N-, and K-dimensions of a matmul operation, using
 // the pad-pack approach to tiling a matmul. Also contains the packing sizes for
@@ -156,25 +202,24 @@ FailureOr<ParameterSetting> ParameterSetting::create(
   auto initType =
       llvm::cast<ShapedType>(linalgOp.getDpsInitOperand(0)->get().getType());
   unsigned nBitsInit = initType.getElementTypeBitWidth();
-  ArrayRef<int64_t> initShape = initType.getShape();
-
   auto lhsType =
       llvm::cast<ShapedType>(linalgOp.getDpsInputOperand(0)->get().getType());
   unsigned nBitsLhs = lhsType.getElementTypeBitWidth();
-  ArrayRef<int64_t> lhsShape = lhsType.getShape();
-
   auto rhsType =
       llvm::cast<ShapedType>(linalgOp.getDpsInputOperand(1)->get().getType());
   unsigned nBitsRhs = rhsType.getElementTypeBitWidth();
 
-  // Shape of the full matmul operation.
-  if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    initShape = initShape.drop_front();
-    lhsShape = lhsShape.drop_front();
-  }
-  const uint64_t M = initShape[0];
-  const uint64_t N = initShape[1];
-  const uint64_t K = lhsShape[1];
+  auto getTotalSize = [](const SmallVector<int64_t> &sizes) {
+    return std::accumulate(sizes.begin(), sizes.end(), 1,
+                           std::multiplies<int64_t>());
+  };
+
+  // Get the shape (M, N, K) of the full matmul operation.
+  auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
+  if (failed(maybeInputDimsAndSizes)) return failure();
+  int64_t M = getTotalSize(maybeInputDimsAndSizes.value().mSizes);
+  int64_t N = getTotalSize(maybeInputDimsAndSizes.value().nSizes);
+  int64_t K = getTotalSize(maybeInputDimsAndSizes.value().kSizes);
 
   // If we are conservative with ensuring that tiles A, B, and C fit at the
   // different memory levels, we should choose the scale factor based
@@ -389,15 +434,23 @@ static SmallVector<int64_t> setOuterPermB(bool isMatmulTransposeB,
 
 static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
     mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp,
-    AMDAIEDevice targetDevice, uint32_t numRows, uint32_t numCols) {
-  // Scale the L1 K with a factor of 2 compared with the outer dimenions M and N
-  // to increase the L1 memory usage.
+    AMDAIEDevice targetDevice, uint32_t numRows, uint32_t numCols,
+    uint32_t numLoops) {
+  // Scale the L1 K with a factor of 2 compared with the outer dimensions M and
+  // N to increase the L1 memory usage.
   auto maybePackPeelTiling = ParameterSetting::create(
       linalgOp, /*isPackPeel=*/true, /*isObjectFifo=*/true, targetDevice,
       numRows, numCols, /*kPackScaleL1=*/2);
   if (failed(maybePackPeelTiling)) return failure();
   auto packPeelTiling = maybePackPeelTiling.value();
 
+  // Get the M, N, K dimension indices from the input indexing maps.
+  auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
+  if (failed(maybeInputDimsAndSizes)) return failure();
+  SmallVector<unsigned, 2> mDims = maybeInputDimsAndSizes.value().mDims;
+  SmallVector<unsigned, 2> nDims = maybeInputDimsAndSizes.value().nDims;
+  SmallVector<unsigned, 2> kDims = maybeInputDimsAndSizes.value().kDims;
+
   AMDAIEDeviceModel deviceModel = getDeviceModel(targetDevice);
 
   // ------------------------------------------------------
@@ -405,10 +458,11 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   // ------------------------------------------------------
   MLIRContext *context = entryPointFn.getContext();
 
-  SmallVector<int64_t> packedSizesL0 = packPeelTiling.getPackSizeL0();
-  if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    packedSizesL0.insert(packedSizesL0.begin(), 0);
-  }
+  // Pack level => 1.
+  SmallVector<int64_t> packedSizesL0(numLoops, 0);
+  packedSizesL0[mDims.back()] = packPeelTiling.m0Pack;
+  packedSizesL0[nDims.back()] = packPeelTiling.n0Pack;
+  packedSizesL0[kDims.back()] = packPeelTiling.k0Pack;
 
   // For matmul, transpose B matrix from [K N n k] to [N K k n]
   // For matmul_transpose_b, we don't have to transpose the B matrix,
@@ -440,17 +494,11 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
                                          outerPerm);
 
   // Pack level => 2.
-  // packed size for [M, N, K, m, n, k]
-  SmallVector<int64_t> packedSizesL1 = {0,
-                                        0,
-                                        0,
-                                        packPeelTiling.m1Pack,
-                                        packPeelTiling.n1Pack,
-                                        packPeelTiling.k1Pack};
-
-  if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    packedSizesL1.insert(packedSizesL1.begin(), 0);
-  }
+  // The number of loops has increased by 3 due to the first-level pack.
+  SmallVector<int64_t> packedSizesL1(numLoops + 3, 0);
+  packedSizesL1[mDims.back() + 3] = packPeelTiling.m1Pack;
+  packedSizesL1[nDims.back() + 3] = packPeelTiling.n1Pack;
+  packedSizesL1[kDims.back() + 3] = packPeelTiling.k1Pack;
 
   // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
   // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
@@ -492,18 +540,24 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   bool fitsInL2 = (l2SizeA + l2SizeB + l2SizeInit) <
                   (deviceModel.getMemTileSizeInBytes() * numCols);
   int64_t scaleL0 = !isBatchMatmul && fitsInL2 ? 2 : 1;
-  SmallVector<int64_t> tileSizeLevel0 = {packPeelTiling.M0 * scaleL0,
-                                         packPeelTiling.N0 * scaleL0};
-  SmallVector<int64_t> tileSizeLevel1 = {numRows, numCols, 0};
-  SmallVector<int64_t> tileSizeLevel2 = {0, 0, 1};
-  SmallVector<int64_t> tileSizeLevel3 = {1, 1, 0, 0, 0, 0};
+  SmallVector<int64_t> tileSizeLevel0(numLoops, 0);
   if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    tileSizeLevel0.insert(tileSizeLevel0.begin(), 1);
-    tileSizeLevel1.insert(tileSizeLevel1.begin(), 0);
-    tileSizeLevel2.insert(tileSizeLevel2.begin(), 0);
-    tileSizeLevel3.insert(tileSizeLevel3.begin(), 0);
+    tileSizeLevel0[0] = 1;
   }
+  tileSizeLevel0[mDims[0]] = packPeelTiling.M0 * scaleL0;
+  tileSizeLevel0[nDims[0]] = packPeelTiling.N0 * scaleL0;
+
+  SmallVector<int64_t> tileSizeLevel1(numLoops, 0);
+  tileSizeLevel1[mDims[0]] = numRows;
+  tileSizeLevel1[nDims[0]] = numCols;
+
+  SmallVector<int64_t> tileSizeLevel2(numLoops, 0);
+  tileSizeLevel2[kDims[0]] = 1;
+
+  SmallVector<int64_t> tileSizeLevel3(numLoops, 0);
+  tileSizeLevel3[mDims[0]] = 1;
+  tileSizeLevel3[nDims[0]] = 1;
 
   TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
                                  tileSizeLevel2, tileSizeLevel3};
@@ -518,7 +572,7 @@ static LogicalResult setRootConfigForPackPeelPipeline(
     mlir::FunctionOpInterface entryPointFn, linalg::LinalgOp linalgOp,
     LowerToAIEPassPipeline useLowerToAIEPipeline, AMDAIEDevice targetDevice,
-    uint32_t numRows, uint32_t numCols) {
+    uint32_t numRows, uint32_t numCols, uint32_t numLoops) {
   bool isObjectFifo =
       useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo;
   auto maybePackPeelTiling =
@@ -527,15 +581,23 @@ static LogicalResult setRootConfigForPackPeelPipeline(
   if (failed(maybePackPeelTiling)) return failure();
   auto packPeelTiling = maybePackPeelTiling.value();
 
+  // Get the M, N, K dimension indices from the input indexing maps.
+  auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
+  if (failed(maybeInputDimsAndSizes)) return failure();
+  SmallVector<unsigned, 2> mDims = maybeInputDimsAndSizes.value().mDims;
+  SmallVector<unsigned, 2> nDims = maybeInputDimsAndSizes.value().nDims;
+  SmallVector<unsigned, 2> kDims = maybeInputDimsAndSizes.value().kDims;
+
   // ------------------------------------------------------
   // --------------- Set packing config -------------------
   // ------------------------------------------------------
   MLIRContext *context = entryPointFn.getContext();
 
-  SmallVector<int64_t> packedSizesL0 = packPeelTiling.getPackSizeL0();
-  if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    packedSizesL0.insert(packedSizesL0.begin(), 0);
-  }
+  // Pack level => 1.
+  SmallVector<int64_t> packedSizesL0(numLoops, 0);
+  packedSizesL0[mDims.back()] = packPeelTiling.m0Pack;
+  packedSizesL0[nDims.back()] = packPeelTiling.n0Pack;
+  packedSizesL0[kDims.back()] = packPeelTiling.k0Pack;
 
   // For matmul, transpose B matrix from [K N n k] to [N K k n]
   // For matmul_transpose_b, we don't have to transpose the B matrix,
@@ -571,17 +633,11 @@ static LogicalResult setRootConfigForPackPeelPipeline(
                                          outerPerm);
 
   // Pack level => 2.
-  // packed size for [M, N, K, m, n, k]
-  SmallVector<int64_t> packedSizesL1 = {0,
-                                        0,
-                                        0,
-                                        packPeelTiling.m1Pack,
-                                        packPeelTiling.n1Pack,
-                                        packPeelTiling.k1Pack};
-
-  if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    packedSizesL1.insert(packedSizesL1.begin(), 0);
-  }
+  // The number of loops has increased by 3 due to the first-level pack.
+  SmallVector<int64_t> packedSizesL1(numLoops + 3, 0);
+  packedSizesL1[mDims.back() + 3] = packPeelTiling.m1Pack;
+  packedSizesL1[nDims.back() + 3] = packPeelTiling.n1Pack;
+  packedSizesL1[kDims.back() + 3] = packPeelTiling.k1Pack;
 
   // Transpose A matrix from [M K m k m0 k0] to [M K k m m0 k0]
   // Transpose C matrix from [M N m n m0 n0] to [M N n m m0 n0]
@@ -611,15 +667,19 @@ static LogicalResult setRootConfigForPackPeelPipeline(
   // ------------------------------------------------------
   // -------------- Set lowering config -------------------
   // ------------------------------------------------------
-  SmallVector<int64_t> tileSizeLevel0 = {packPeelTiling.M0, packPeelTiling.N0};
-  SmallVector<int64_t> tileSizeLevel1 = {0, 0, packPeelTiling.K0};
-  SmallVector<int64_t> tileSizeLevel2 = {1, 1, 0, 0, 0, 0};
-
+  SmallVector<int64_t> tileSizeLevel0(numLoops, 0);
   if (isa<linalg::BatchMatmulOp>(linalgOp)) {
-    tileSizeLevel0.insert(tileSizeLevel0.begin(), 1);
-    tileSizeLevel1.insert(tileSizeLevel1.begin(), 0);
-    tileSizeLevel2.insert(tileSizeLevel2.begin(), 0);
+    tileSizeLevel0[0] = 1;
   }
+  tileSizeLevel0[mDims[0]] = packPeelTiling.M0;
+  tileSizeLevel0[nDims[0]] = packPeelTiling.N0;
+
+  SmallVector<int64_t> tileSizeLevel1(numLoops, 0);
+  tileSizeLevel1[kDims[0]] = 1;
+
+  SmallVector<int64_t> tileSizeLevel2(numLoops, 0);
+  tileSizeLevel2[mDims[0]] = 1;
+  tileSizeLevel2[nDims[0]] = 1;
 
   TileSizesListType tileSizes = {tileSizeLevel0, tileSizeLevel1,
                                  tileSizeLevel2};
@@ -842,6 +902,8 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
                                    uint32_t numCols) {
   assert(!getLoweringConfig(genericOp) &&
          "expected lowering_config is not set");
+  unsigned numLoops = genericOp.getNumLoops();
+  assert(numLoops <= 7 && "expected the number of loops to be no more than 7");
   if (!isMatmul(genericOp) && !isMatmulTransposeA(genericOp) &&
       !isMatmulTransposeB(genericOp))
     return genericOp.emitOpError(
@@ -850,11 +912,11 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
   if (passPipeline == TilePassPipeline::PackPeelPipeline) {
     return setRootConfigForPackPeelPipeline(entryPointFn, genericOp,
                                             useLowerToAIEPipeline, targetDevice,
-                                            numRows, numCols);
+                                            numRows, numCols, numLoops);
   }
   if (passPipeline == TilePassPipeline::PackPeel4LevelTilingPipeline) {
     return setRootConfigForPackPeel4LevelTilingPipeline(
-        entryPointFn, genericOp, targetDevice, numRows, numCols);
+        entryPointFn, genericOp, targetDevice, numRows, numCols, numLoops);
   }
   if (passPipeline == TilePassPipeline::PadPackPipeline) {
     return setRootConfigForPadPackPipeline(entryPointFn, genericOp,
@@ -875,15 +937,7 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
          "expected lowering_config is not set");
   auto linalgOp = cast<linalg::LinalgOp>(contractionOp.getOperation());
   unsigned numLoops = linalgOp.getNumLoops();
-  {
-    SmallVector<unsigned> dims;
-    linalgOp.getReductionDims(dims);
-    if (dims.size() != 1 || dims[0] != numLoops - 1) {
-      return linalgOp.emitOpError(
-                 "is expected to have exactly one reduction dim, ")
-             << "and that it is the innermost dim (" << numLoops - 1 << ").";
-    }
-  }
+  assert(numLoops <= 7 && "expected the number of loops to be no more than 7");
 
   // TODO (nmeshram) : This needs to be moved in a separate more generalized
   // logic. Also, need a flag to experiment between pad based and pack based
   // approach.
   if (passPipeline == TilePassPipeline::PackPeelPipeline) {
     return setRootConfigForPackPeelPipeline(entryPointFn, linalgOp,
                                             useLowerToAIEPipeline, targetDevice,
-                                            numRows, numCols);
+                                            numRows, numCols, numLoops);
   }
   if (passPipeline == TilePassPipeline::PackPeel4LevelTilingPipeline) {
     return setRootConfigForPackPeel4LevelTilingPipeline(
-        entryPointFn, linalgOp, targetDevice, numRows, numCols);
+        entryPointFn, linalgOp, targetDevice, numRows, numCols, numLoops);
   }
   if (passPipeline == TilePassPipeline::PadPackPipeline) {
     return setRootConfigForPadPackPipeline(entryPointFn, linalgOp, targetDevice,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
index 6b6718808..aef9e1e26 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_air.mlir
@@ -189,7 +189,7 @@ builtin.module {
 
 // -----
 
-// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -216,7 +216,7 @@ builtin.module {
 
 // -----
 
-// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -244,7 +244,7 @@ module {
 // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config
-// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-PACK-PEEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir
index 0e8e80a01..f8bbb02f5 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_generic.mlir
@@ -2,7 +2,7 @@
 
 // Test generic version of matmul.
 
-// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK{LITERAL}: #amdaie.packing_config
 module {
   func.func @matmul_generic_128x128x256_i32() {
@@ -32,7 +32,7 @@ module {
 
 // Test generic version of matmul_transpose_b.
 
-// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK{LITERAL}: #amdaie.packing_config
 module {
   func.func @matmul_transpose_b_generic_128x128x256_i32() {
@@ -62,7 +62,7 @@ module {
 
 // Test generic version of matmul_transpose_a.
 
-// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK{LITERAL}: #amdaie.packing_config
 module {
   func.func @matmul_transpose_a_generic_128x128x256_i32() {
@@ -85,3 +85,33 @@ module {
     return
   }
 }
+
+// -----
+
+// Test generic version of matmul with the reduction loop first, i.e., (d0, d1, d2) = (k, m, n).
+
+// CHECK{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK{LITERAL}: #amdaie.packing_config
+module {
+  func.func @matmul_generic_128x128x256_i32() {
+    %c0_i32 = arith.constant 0 : i32
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x256xi32>>
+    %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<256x128xi32>>
+    %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x256xi32>> -> tensor<128x256xi32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [256, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<256x128xi32>> -> tensor<256x128xi32>
+    %5 = tensor.empty() : tensor<128x128xi32>
+    %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<128x128xi32>) -> tensor<128x128xi32>
+    // CHECK: linalg.generic
+    // CHECK-SAME: attrs = {lowering_config = #config, packing_config = #packingConfig}
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d0)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>], iterator_types = ["reduction", "parallel", "parallel"]} ins(%3, %4 : tensor<128x256xi32>, tensor<256x128xi32>) outs(%6 : tensor<128x128xi32>) {
+    ^bb0(%in: i32, %in_0: i32, %out: i32):
+      %8 = arith.muli %in, %in_0 : i32
+      %9 = arith.addi %out, %8 : i32
+      linalg.yield %9 : i32
+    } -> tensor<128x128xi32>
+    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [128, 128], strides = [1, 1] : tensor<128x128xi32> -> !flow.dispatch.tensor<writeonly:tensor<128x128xi32>>
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir
index c40e27717..f7c5c8831 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu1.mlir
@@ -3,16 +3,16 @@
 // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu1_4col})' %s | FileCheck %s --check-prefix=CHECK-4x4
 // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu1_4col use-tile-pipeline=pack-peel-4-level-tiling})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -39,16 +39,16 @@ module {
 
 // -----
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -75,16 +75,16 @@ module {
 
 // -----
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -111,16 +111,16 @@ module {
 
 // -----
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -147,16 +147,16 @@ module {
 
 // -----
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
@@ -183,16 +183,16 @@ module {
 
 // -----
 
-// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-2x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-2x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x2{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x2{LITERAL}: #amdaie.packing_config
-// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
+// CHECK-4x4{LITERAL}: #config = #iree_codegen.lowering_config
 // CHECK-4x4{LITERAL}: #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir
index d36bd925d..63497578b 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir
@@ -2,7 +2,7 @@
 // RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4 use-tile-pipeline=pack-peel-4-level-tiling})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL
 
 // CHECK: #config = #iree_codegen.lowering_config
 // CHECK: #packingConfig = #amdaie.packing_config
-// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
+// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config
 // PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config
 #pipeline_layout = #hal.pipeline.layout,
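
---

Background on the approach: linalg::inferContractionDims classifies each loop
of a contraction-like linalg op as batch/m/n/k purely from the indexing maps,
so the lowering strategy no longer needs per-named-op shape special cases.
Below is a minimal, self-contained C++ sketch of the same dim-and-size
fetching idea, under stated assumptions: it handles only static loop ranges,
and the helper names (groupSize, printContractionSizes) are illustrative,
not part of the patch.

#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Multiplies the static loop ranges selected by one dim group (m, n, or k).
// E.g. with mDims = {0, 1} and loop ranges {2, 64, 512, 256}, M = 2 * 64.
static FailureOr<int64_t> groupSize(linalg::LinalgOp op,
                                    llvm::ArrayRef<unsigned> dimGroup) {
  auto ranges = op.getStaticLoopRanges();
  int64_t total = 1;
  for (unsigned d : dimGroup) {
    if (d >= ranges.size() || ShapedType::isDynamic(ranges[d]))
      return failure();  // Out-of-range or dynamic loop bound: give up.
    total *= ranges[d];
  }
  return total;
}

// Prints the overall M, N, K of any contraction-like linalg op, whatever the
// loop order (e.g. (m, n, k) or (k, m, n)) or transposition of the operands.
static LogicalResult printContractionSizes(linalg::LinalgOp op) {
  FailureOr<linalg::ContractionDimensions> dims =
      linalg::inferContractionDims(op);
  if (failed(dims)) return failure();  // Not a contraction.
  FailureOr<int64_t> m = groupSize(op, dims->m);
  FailureOr<int64_t> n = groupSize(op, dims->n);
  FailureOr<int64_t> k = groupSize(op, dims->k);
  if (failed(m) || failed(n) || failed(k)) return failure();
  llvm::outs() << "M=" << *m << " N=" << *n << " K=" << *k << "\n";
  return success();
}

Because the m/n/k groups are loop indices rather than fixed positions, the
same code covers the new test above, where the reduction loop comes first:
inferContractionDims returns m = {1}, n = {2}, k = {0}, and the tile-size
vectors are then indexed by mDims[0], nDims[0], kDims[0] instead of assuming
a fixed {M, N, K} loop order.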