iree-org · krzysz00 · Jan 28, 2025
@@ -786,13 +786,10 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
     OpFoldResult ivOffset = mulOfrs(b, nestedLoc, stride, ivs[ivIdx]);
     kIndex = addOfrs(b, nestedLoc, kIndex, ivOffset);
   }
-  FailureOr<SmallVector<Value>> maybeDelinKOffset = affine::delinearizeIndex(
-      b, nestedLoc, getValueOrCreateConstantIndexOp(b, loc, kIndex),
-      getValueOrCreateConstantIndexOp(b, loc, (kBasis)));
-  if (failed(maybeDelinKOffset)) {
-    return failure();
-  }
-  SmallVector<Value> delinKOffset = maybeDelinKOffset.value();
+  auto delinKOffsetOp = b.create<affine::AffineDelinearizeIndexOp>(
+      nestedLoc, getValueOrCreateConstantIndexOp(b, loc, kIndex), kBasis,
+      /*hasOuterBound=*/true);
+  SmallVector<Value> delinKOffset = delinKOffsetOp.getResults();
   // Split the delinearized offsets into the window offsets (for M offsets)
   // and the K offsets for the input tensor.
   SmallVector<Value> windowOffset, inputKOffset;
@@ -823,13 +820,10 @@ FailureOr<SmallVector<Value>> Im2colOp::decomposeOperation(OpBuilder &b) {
   // Delinearize the m_offset * m_strides into the convolution output space.
   // `mBasis` contains the basis for the iteration space of result of the
   // convolution op (i.e., basis for result H and W dims).
-  FailureOr<SmallVector<Value>> maybeDelinMOffset = affine::delinearizeIndex(
-      b, nestedLoc,
-      getValueOrCreateConstantIndexOp(b, nestedLoc, linearMOffset), mBasis);
-  if (failed(maybeDelinMOffset)) {
-    return failure();
-  }
-  SmallVector<Value> delinMOffset = maybeDelinMOffset.value();
+  auto delinMOffsetOp = b.create<affine::AffineDelinearizeIndexOp>(
+      nestedLoc, getValueOrCreateConstantIndexOp(b, loc, linearMOffset), mBasis,
+      /*hasOuterBound=*/true);
+  SmallVector<Value> delinMOffset = delinMOffsetOp.getResults();
 
   // Compute the final offsets into the input tensor.
   OpFoldResult zero = b.getIndexAttr(0);

@@ -15,9 +15,8 @@ module {
     return %7 : tensor<2x?x4xf32>
   }
 }
-//   CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 160) * 640)>
-//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) floordiv 32 + s1 floordiv 480)>
-//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) mod 32 + s1 floordiv 160 - (s1 floordiv 480) * 3)>
+//   CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
+//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
 // CHECK-LABEL: func.func @im2col_untile_k
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:     %[[mSIZE:[a-zA-Z0-9_]+]]
@@ -27,12 +26,15 @@ module {
 //   CHECK-DAG:   %[[C1:.+]] = arith.constant 1 : index
 //   CHECK-DAG:   %[[C2:.+]] = arith.constant 2 : index
 //       CHECK:   %[[OUT_TILE:.+]] = tensor.empty(%[[mSIZE]]) : tensor<2x?x4xf32>
+//       CHECK:   %[[kScaled:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
 //       CHECK:   %[[bLOOP:.+]] = scf.for %[[b:.+]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[OUT0:.+]] = %[[OUT_TILE]]) -> (tensor<2x?x4xf32>)
 //       CHECK:     %[[mLOOP:.+]] = scf.for %[[m:.+]] = %[[C0]] to %[[mSIZE]] step %[[C1]] iter_args(%[[OUT1:.+]] = %[[OUT0]]) -> (tensor<2x?x4xf32>)
-//   CHECK-DAG:       %[[kIDX:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
-//   CHECK-DAG:       %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[m]])[%[[mOFF]], %[[K]]]
-//   CHECK-DAG:       %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[m]])[%[[mOFF]], %[[K]]]
-//       CHECK:       %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[b]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
+//   CHECK-DAG:       %[[kParts:.+]]:3 = affine.delinearize_index %[[kScaled]] into (3, 3, 640)
+//   CHECK-DAG:       %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[m]])[%[[mOFF]]]
+//   CHECK-DAG:       %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-DAG:       %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0)[%[[kParts]]#0]
+//   CHECK-DAG:       %[[wIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#1)[%[[kParts]]#1]
+//       CHECK:       %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[b]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
 //       CHECK:       %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT1]][%[[b]], %[[m]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x?x4xf32> to tensor<1x1x4xf32>
 //       CHECK:       %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
 //       CHECK:       %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT1]][%[[b]], %[[m]], 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> into tensor<2x?x4xf32>
@@ -57,9 +59,9 @@ module {
     return %8 : tensor<2x?x?xf32>
   }
 }
-//   CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0)[s0] -> ((d0 + s0) floordiv 10)>
-//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (((d0 + s0) floordiv 32) * 5 + (((d1 + s1) mod 10) floordiv 5) * 4)>
-//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * 3 + d1 * 7 + s0 * 3 + s1 * 7 - ((d0 + s0) floordiv 32) * 96 - ((d1 + s1) floordiv 5) * 35)>
+//   CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
+//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0 * 5 + d1 * 4)>
+//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0 * 3 + d1 * 7)>
 // CHECK-LABEL: func.func @im2col_transposed_m_pos
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:     %[[mSIZE:[a-zA-Z0-9_]+]]
@@ -74,9 +76,12 @@ module {
 //       CHECK:     %[[mLOOP:.+]] = scf.for %[[m:.+]] = %[[C0]] to %[[mSIZE]] step %[[C1]] iter_args(%[[OUT1:.+]] = %[[OUT0]]) -> (tensor<2x?x?xf32>)
 //       CHECK:       %[[kLOOP:.+]] = scf.for %[[k:.+]] = %[[C0]] to %[[kSIZE]] step %[[C1]] iter_args(%[[OUT2:.+]] = %[[OUT1]]) -> (tensor<2x?x?xf32>)
 //   CHECK-DAG:         %[[kIDX:.+]] = affine.apply #[[$MAP]](%[[k]])[%[[kOFF]]]
-//   CHECK-DAG:         %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[m]], %[[k]])[%[[mOFF]], %[[kOFF]]]
-//   CHECK-DAG:         %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[m]], %[[k]])[%[[mOFF]], %[[kOFF]]]
-//       CHECK:         %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[kIDX]], %[[b]], %[[wIDX]], %[[hIDX]]] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<640x2x101x172xf32> to tensor<1x1x1xf32>
+//   CHECK-DAG:         %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (640, 2, 5)
+//   CHECK-DAG:         %[[mIDX:.+]] = affine.apply #[[$MAP]](%[[m]])[%[[mOFF]]]
+//   CHECK-DAG:         %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-DAG:         %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0, %[[kParts]]#1)
+//   CHECK-DAG:         %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[mParts]]#1, %[[kParts]]#2)
+//       CHECK:         %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[kParts]]#0, %[[b]], %[[wIDX]], %[[hIDX]]] [1, 1, 1, 1] [1, 1, 1, 1] : tensor<640x2x101x172xf32> to tensor<1x1x1xf32>
 //       CHECK:         %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT2]][%[[b]], %[[m]], %[[k]]] [1, 1, 1] [1, 1, 1] : tensor<2x?x?xf32> to tensor<1x1x1xf32>
 //       CHECK:         %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x1xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x1xf32>) -> tensor<1x1x1xf32>
 //       CHECK:         %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT2]][%[[b]], %[[m]], %[[k]]] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xf32> into tensor<2x?x?xf32>
@@ -99,9 +104,9 @@ module {
     return %7 : tensor<2x?x?x2x4xf32>
   }
 }
-//   CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (d0 * 4 + s0 * 4 - ((d0 + s0) floordiv 160) * 640)>
-//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> ((d1 + s2 + d0 * s0 + s1 * s0) floordiv 32 + (d2 + s3) floordiv 480)>
-//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s0 + d1 + s1 * s0 + s2 - ((d1 + s2 + d0 * s0 + s1 * s0) floordiv 32) * 32 + (d2 + s3) floordiv 160 - ((d2 + s3) floordiv 480) * 3)>
+//   CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0)[s0] -> (d0 * 4 + s0 * 4)>
+//   CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 + s1 * s0 + s2)>
+//   CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0 + d1)>
 // CHECK-LABEL: func.func @im2col_expanded
 //  CHECK-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-SAME:     %[[mSIZE0:[a-zA-Z0-9_]+]]
@@ -119,9 +124,12 @@ module {
 //       CHECK:       %[[mLOOP1:.+]] = scf.for %[[m1:.+]] = %[[C0]] to %[[mSIZE1]] step %[[C1]] iter_args(%[[OUT2:.+]] = %[[OUT1]]) -> (tensor<2x?x?x2x4xf32>)
 //       CHECK:         %[[kLOOP:.+]] = scf.for %[[k:.+]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[OUT3:.+]] = %[[OUT2]]) -> (tensor<2x?x?x2x4xf32>)
 //   CHECK-DAG:           %[[kIDX:.+]] = affine.apply #[[$MAP]](%[[k]])[%[[kOFF]]]
-//   CHECK-DAG:           %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[m0]], %[[m1]], %[[k]])[%[[mSTRIDE]], %[[mOFF0]], %[[mOFF1]], %[[kOFF]]]
-//   CHECK-DAG:           %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[m0]], %[[m1]], %[[k]])[%[[mSTRIDE]], %[[mOFF0]], %[[mOFF1]], %[[kOFF]]]
-//       CHECK:           %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[b]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x1x4xf32>
+//   CHECK-DAG:           %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (3, 3, 640)
+//   CHECK-DAG:           %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[m0]], %[[m1]])[%[[mSTRIDE]], %[[mOFF0]], %[[mOFF1]]]
+//   CHECK-DAG:           %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-DAG:           %[[hIDX:.+]] = affine.apply #[[$MAP2]](%[[mParts]]#0, %[[kParts]]#0)
+//   CHECK-DAG:           %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[mParts]]#1, %[[kParts]]#1)
+//       CHECK:           %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[b]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x1x4xf32>
 //       CHECK:           %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT3]][%[[b]], %[[m0]], %[[m1]], %[[k]], 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] : tensor<2x?x?x2x4xf32> to tensor<1x1x1x4xf32>
 //       CHECK:           %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x1x4xf32>) -> tensor<1x1x1x4xf32>
 //       CHECK:           %[[INSERT:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT3]][%[[b]], %[[m0]], %[[m1]], %[[k]], 0] [1, 1, 1, 1, 4] [1, 1, 1, 1, 1] : tensor<1x1x1x4xf32> into tensor<2x?x?x2x4xf32>
@@ -165,9 +173,8 @@ module {
     return %7 : tensor<2x2x4xf32>
   }
 }
-//   CHECK-UNROLL-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 160) * 640)>
-//   CHECK-UNROLL-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) floordiv 32 + s1 floordiv 480)>
-//   CHECK-UNROLL-DAG: #[[$MAP2:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) mod 32 + s1 floordiv 160 - (s1 floordiv 480) * 3)>
+//   CHECK-UNROLL-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4)>
+//   CHECK-UNROLL-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
 // CHECK-UNROLL-LABEL: func.func @im2col_unrolled
 //  CHECK-UNROLL-SAME:     %[[ARG0:[a-zA-Z0-9_]+]]
 //  CHECK-UNROLL-SAME:     %[[mOFF:[a-zA-Z0-9_]+]]
@@ -179,39 +186,48 @@ module {
 //  First iteration
 //
 //   CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
-//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
-//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
-//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
+//   CHECK-UNROLL-DAG:   %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (3, 3, 640)
+//   CHECK-UNROLL-DAG:   %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[C0]])[%[[mOFF]]]
+//   CHECK-UNROLL-DAG:   %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0)[%[[kParts]]#0]
+//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#1)[%[[kParts]]#1]
+//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[INSERT0:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> into tensor<2x2x4xf32>
 
 //  Second iteration
 //
-//   CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
-//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
-//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
-//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
+//   CHECK-UNROLL-DAG:   %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (3, 3, 640)
+//   CHECK-UNROLL-DAG:   %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[C1]])[%[[mOFF]]]
+//   CHECK-UNROLL-DAG:   %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0)[%[[kParts]]#0]
+//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#1)[%[[kParts]]#1]
+//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[INSERT1:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> into tensor<2x2x4xf32>
 
 //  Third iteration
 //
-//   CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
-//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
-//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
-//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
+//   CHECK-UNROLL-DAG:   %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (3, 3, 640)
+//   CHECK-UNROLL-DAG:   %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[C0]])[%[[mOFF]]]
+//   CHECK-UNROLL-DAG:   %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0)[%[[kParts]]#0]
+//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#1)[%[[kParts]]#1]
+//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[INSERT2:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> into tensor<2x2x4xf32>
 
 //  Fourth iteration
 //
-//   CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[$MAP]]()[%[[K]]]
-//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
-//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
-//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
+//   CHECK-UNROLL-DAG:   %[[kParts:.+]]:3 = affine.delinearize_index %[[kIDX]] into (3, 3, 640)
+//   CHECK-UNROLL-DAG:   %[[mIDX:.+]] = affine.apply #[[$MAP1]](%[[C1]])[%[[mOFF]]]
+//   CHECK-UNROLL-DAG:   %[[mParts:.+]]:2 = affine.delinearize_index %[[mIDX]] into (32, 32)
+//   CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#0)[%[[kParts]]#0]
+//   CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[$MAP1]](%[[mParts]]#1)[%[[kParts]]#1]
+//       CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kParts]]#2] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<1x1x4xf32>) outs(%[[OUT_SLICE]] : tensor<1x1x4xf32>) -> tensor<1x1x4xf32>
 //       CHECK-UNROLL:   %[[INSERT3:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<1x1x4xf32> into tensor<2x2x4xf32>