From 227a0f79ab54cb807fec49362c4973b0fb3ca7af Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 9 Dec 2024 10:04:43 -0600 Subject: [PATCH] [Blocking] Rewrite blocking pass to generate small 2D Xetile Ops (#978) --- include/imex/Dialect/XeTile/IR/XeTileTypes.td | 4 + .../XeTile/Transforms/BlockingAnalysis.h | 6 +- .../imex/Dialect/XeTile/Transforms/Passes.td | 5 +- include/imex/Utils/XeCommon.h | 24 +- .../XeTileToXeGPU/ArithOpConversion.cpp | 20 +- .../XeTileToXeGPU/XeTileOpConversion.cpp | 117 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 1778 ++++++++++++++++- .../XeTile/Transforms/BlockingAnalysis.cpp | 89 +- lib/Utils/XeCommon.cpp | 92 +- .../Transforms/Blocking/unit_tests.mlir | 5 + .../Blocking/unit_tests_transform.mlir | 1753 ++++++++++++++++ 11 files changed, 3640 insertions(+), 253 deletions(-) create mode 100644 test/Dialect/XeTile/Transforms/Blocking/unit_tests_transform.mlir diff --git a/include/imex/Dialect/XeTile/IR/XeTileTypes.td b/include/imex/Dialect/XeTile/IR/XeTileTypes.td index b0238266b..7cd2dc4df 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileTypes.td +++ b/include/imex/Dialect/XeTile/IR/XeTileTypes.td @@ -89,6 +89,10 @@ def XeTile : XeTile_Type<"Tile", "tile", [ShapedTypeInterface], return llvm::cast(cloneWith(getShape(), elementType)); } + TileType clone(llvm::ArrayRef shape) { + return llvm::cast(cloneWith(shape, getElementType())); + } + xetile::SubGroupMapAttr getSgMap() { auto encoding = llvm::dyn_cast_if_present(getEncoding()); if (encoding) diff --git a/include/imex/Dialect/XeTile/Transforms/BlockingAnalysis.h b/include/imex/Dialect/XeTile/Transforms/BlockingAnalysis.h index 48bd95523..e6a4c1faf 100644 --- a/include/imex/Dialect/XeTile/Transforms/BlockingAnalysis.h +++ b/include/imex/Dialect/XeTile/Transforms/BlockingAnalysis.h @@ -41,10 +41,6 @@ class Block { llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Block blk); -// A pair of operator and operand index number representing -// the use point of a value. -typedef std::pair UsePoint; - class BlockingAnalysis { public: explicit BlockingAnalysis(std::shared_ptr uArch) { @@ -54,7 +50,7 @@ class BlockingAnalysis { mlir::LogicalResult run(mlir::Operation *op); - Block getUseBlockSize(mlir::Value val, UsePoint point) const; + Block getUseBlockSize(mlir::Value val, mlir::OpOperand &point) const; Block getDefBlockSize(mlir::Value val) const; void printAnalysisResult(); diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.td b/include/imex/Dialect/XeTile/Transforms/Passes.td index 83c141718..4f4883aec 100644 --- a/include/imex/Dialect/XeTile/Transforms/Passes.td +++ b/include/imex/Dialect/XeTile/Transforms/Passes.td @@ -91,7 +91,10 @@ def XeTileBlocking : Pass<"xetile-blocking", "::mlir::gpu::GPUModuleOp">{ let options = [ Option<"device", "device", "std::string", /*default=*/"\"pvc\"", - "gpu platform architecture where these ops are running"> + "gpu platform architecture where these ops are running">, + Option<"EnableTransform", "enable-2d-transform", "bool", + /*default=*/"false", + "Using 2D transform or 4D Conversion."> ]; } diff --git a/include/imex/Utils/XeCommon.h b/include/imex/Utils/XeCommon.h index 77fdfdcd2..797f6be2d 100644 --- a/include/imex/Utils/XeCommon.h +++ b/include/imex/Utils/XeCommon.h @@ -28,8 +28,18 @@ #include #include using namespace mlir::xegpu; + namespace imex { +using PackFuncTy = std::function( + mlir::Value, mlir::Value, mlir::Location, mlir::OpBuilder &)>; + +// A wrapper function to merge small vectors into a big one. 
It takes a
+// range of mlir::Value objects with mlir::VectorType, and merges them
+// into a big vector using the provided transformation function.
+mlir::Value packVectorsWith(mlir::ValueRange ins, PackFuncTy op,
+                            mlir::Location loc, mlir::OpBuilder &builder);
+
 // Combine vectors vertically while keeping the logical data layout.
 // As an example, given two vectors (2x4xf16) p and q, it will merge
 // them in to a 4x4xf16 vector.
@@ -40,7 +50,19 @@ namespace imex {
 // q5, q6, q7, q8
 mlir::TypedValue stack(mlir::Value vecUp, mlir::Value vecDown,
                        mlir::Location loc,
-                       mlir::PatternRewriter &rewriter);
+                       mlir::OpBuilder &builder);
+
+// Merge vectors horizontally while keeping the logical data layout.
+// 1 2 3 4  +  10 11 12  =  1 2 3 4 10 11 12
+// 5 6 7 8     13 14 15     5 6 7 8 13 14 15
+// Since there is no direct op for this in MLIR, we use
+// ShapeCast and Shuffle to mimic it. This comes at the
+// cost of complex shuffle masks; the mask for the example
+// above is: 0 1 2 3 8 9 10
+//           4 5 6 7 11 12 13
+mlir::TypedValue concat(mlir::Value lhs, mlir::Value rhs,
+                        mlir::Location loc,
+                        mlir::OpBuilder &builder);
 
 // It checks each GPUFuncOp in the module to see
 // whether they have arguments and outputs with
diff --git a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp
index 1102a8b64..08aa74a6b 100644
--- a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp
+++ b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp
@@ -17,21 +17,6 @@
 namespace imex {
-using VectorTypedValue = mlir::TypedValue;
-using funcTy = VectorTypedValue(mlir::Value, mlir::Value, mlir::Location,
-                                mlir::PatternRewriter &);
-
-// see its description in XeTileOpConversion.cpp
-extern VectorTypedValue concat(mlir::Value v1, mlir::Value v2,
-                               mlir::Location loc,
-                               mlir::PatternRewriter &rewriter);
-
-// see its description in XeTileOpConversion.cpp
-extern mlir::Value mergeVectorsWrapper(mlir::ValueRange ins,
-                                       std::function transFunc,
-                                       mlir::Location loc,
-                                       XeOneToNPatternRewriter &rewriter);
-
 static mlir::Value createBinOp(mlir::vector::CombiningKind kind,
                                mlir::Value lhs, mlir::Value rhs,
                                mlir::Type elemTy, mlir::Location &loc,
@@ -318,8 +303,7 @@ class SgVectorMultiDimReductionOpPattern
       // TODO: need a better way to represent the result (align with
       // unpack/pack logic). currently we just shuffle them and cast it to the
       // type/shape in xetile program.
-      auto reducedVal =
-          mergeVectorsWrapper(intermediates, concat, loc, rewriter);
+      auto reducedVal = packVectorsWith(intermediates, concat, loc, rewriter);
       auto targetTy = mlir::VectorType::get({shape[1], shape[3]}, elemTy);
       auto newOp = rewriter.create(loc, targetTy, reducedVal);
@@ -338,7 +322,7 @@ class SgVectorMultiDimReductionOpPattern
       // currently we just shuffle them and cast it to the type/shape in
       // xetile program.
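       // For illustration, assuming `intermediates` holds four small row
       // vectors, packVectorsWith merges them pairwise with the provided
       // callback until a single value remains, roughly:
       //   auto v01 = concat(intermediates[0], intermediates[1], loc, rewriter);
       //   auto v23 = concat(intermediates[2], intermediates[3], loc, rewriter);
       //   auto all = concat(v01, v23, loc, rewriter);
       // where each concat is built from a vector.shape_cast and a
       // vector.shuffle with a row-interleaving mask.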
auto reductionVal = - mergeVectorsWrapper(intermediates, concat, loc, rewriter); + packVectorsWith(intermediates, concat, loc, rewriter); auto targetTy = mlir::VectorType::get({shape[0], shape[2]}, elemTy); auto newOp = rewriter.create(loc, targetTy, reductionVal); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index 00c7fc5b9..80d6b1fcd 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -38,106 +38,6 @@ using mlir::vector::ShapeCastOp; using mlir::vector::ShuffleOp; using mlir::vector::SplatOp; -using VectorTypedValue = mlir::TypedValue; -using funcTy = VectorTypedValue(mlir::Value, mlir::Value, mlir::Location, - mlir::PatternRewriter &); - -// generate linearized shuffle mask for concat. -static llvm::SmallVector -getShuffleMask(llvm::ArrayRef shape1, llvm::ArrayRef shape2) { - assert(shape1.size() == shape2.size() && shape1.size() <= 2 && - "only 1D/2D shape are supported."); - assert(shape1.drop_back() == shape2.drop_back() && - "the row dim of the shapes should match."); - int64_t size1 = std::accumulate(shape1.begin(), shape1.end(), 1, - std::multiplies()); - int64_t size2 = std::accumulate(shape2.begin(), shape2.end(), 1, - std::multiplies()); - llvm::SmallVector mask(size1 + size2); - auto rows = shape1.size() == 1 ? 1 : shape1[0]; - auto cols1 = shape1.size() == 1 ? shape1[0] : shape1[1]; - auto cols2 = shape2.size() == 1 ? shape2[0] : shape2[1]; - for (int64_t i = 0; i < rows; i++) { - int64_t s = i * (cols1 + cols2); - int64_t m = s + cols1; - int64_t e = m + cols2; - int64_t v1 = i * cols1; - int64_t v2 = size1 + i * cols2; - std::iota(mask.begin() + s, mask.begin() + m, v1); - std::iota(mask.begin() + m, mask.begin() + e, v2); - } - return mask; -} - -// merge vectors horizontally while keep the logical data layout. -// 1 2 3 4 + 10 11 12 = 1 2 3 4 10 11 12 -// 5 6 7 8 13 14 15 5 6 7 8 13 14 15 -// since there is no direct op in mlir exists, we will -// using ShapeCast and Shuffle to mimic it. It comes with -// cost of complex shuffle masks. the mask for the above one -// will be like this: 0 1 2 3 8 9 10 -// 4 5 6 7 11 12 13 -VectorTypedValue concat(mlir::Value vecLeft, mlir::Value vecRight, - mlir::Location loc, mlir::PatternRewriter &rewriter) { - auto vecLeftTy = llvm::cast(vecLeft.getType()); - auto vecRightTy = llvm::cast(vecRight.getType()); - - assert(vecLeftTy.getShape()[0] == vecLeftTy.getShape()[0] && - "Operands of concat() do not have the same number of rows."); - assert(vecLeftTy.getRank() <= 2 && - vecRightTy.getRank() == vecLeftTy.getRank() && - "Currently concat only works on 1D/2D vector."); - - auto elemTy = vecLeftTy.getElementType(); - auto leftSize = vecLeftTy.getNumElements(); - auto leftShape = vecLeftTy.getShape(); - auto leftFlatTy = mlir::VectorType::get({vecLeftTy.getNumElements()}, elemTy); - - auto rightSize = vecRightTy.getNumElements(); - auto rightShape = vecRightTy.getShape(); - auto rightFlatTy = - mlir::VectorType::get({vecRightTy.getNumElements()}, elemTy); - - auto newShape = vecLeftTy.getRank() == 1 - ? 
llvm::SmallVector({leftSize + rightSize}) - : llvm::SmallVector( - {leftShape[0], leftShape[1] + rightShape[1]}); - auto castLeft = rewriter.create(loc, leftFlatTy, vecLeft); - auto castRight = rewriter.create(loc, rightFlatTy, vecRight); - auto mask = getShuffleMask(leftShape, rightShape); - auto shuffleOp = rewriter.create(loc, castLeft, castRight, mask); - auto targetTy = mlir::VectorType::get(newShape, elemTy); - auto newOp = rewriter.create(loc, targetTy, shuffleOp); - return newOp; -} - -// A wrapper function to merge small vectors into a big one. It takes a -// range of mlir::Value objects with mlir::VectorType, and merge them -// into a big vector using the provided transformation function. -mlir::Value mergeVectorsWrapper(mlir::ValueRange ins, - std::function transFunc, - mlir::Location loc, - XeOneToNPatternRewriter &rewriter) { - llvm::SmallVector shuffleOps(ins.begin(), ins.end()); - while (shuffleOps.size() > 1) { - auto curr = shuffleOps; - shuffleOps.clear(); - size_t currPairStartIdx{0}; - while (currPairStartIdx < curr.size() - 1) { - size_t leftIdx{currPairStartIdx++}; - size_t rightIdx{currPairStartIdx++}; - auto newOp = transFunc(curr[leftIdx], curr[rightIdx], loc, rewriter); - shuffleOps.push_back(newOp); - } - if (currPairStartIdx < curr.size()) { - assert(currPairStartIdx == curr.size() - 1); - shuffleOps.push_back(curr[curr.size() - 1]); - } - } - - return shuffleOps[0]; -} - // Check that lowerUnpackOrPack will be able to evenly combine/split the input // grid into the output grid. static bool isUnpackPackCompatible(xetile::TileUnpackOp unpackOp, @@ -164,7 +64,7 @@ static bool isUnpackPackCompatible(xetile::TileUnpackOp unpackOp, // a unified function lowering Unpack and Pack ops. static llvm::SmallVector -lowerUnpackOrPack(XeOneToNPatternRewriter &rewriter, mlir::Operation *op, +lowerUnpackOrPack(mlir::PatternRewriter &rewriter, mlir::Location loc, mlir::ValueRange inputs, mlir::DenseI64ArrayAttr inBlkSizes, mlir::DenseI64ArrayAttr outBlkSizes, llvm::ArrayRef inGrids, @@ -183,8 +83,7 @@ lowerUnpackOrPack(XeOneToNPatternRewriter &rewriter, mlir::Operation *op, auto idx = i * inGrids[1] + j; valSet.push_back(inputs[idx]); if (valSet.size() == static_cast(nums)) { - auto newOp = - mergeVectorsWrapper(valSet, stack, op->getLoc(), rewriter); + auto newOp = packVectorsWith(valSet, stack, loc, rewriter); intermediates[i / nums * inGrids[1] + j] = newOp; valSet.clear(); } @@ -205,7 +104,7 @@ lowerUnpackOrPack(XeOneToNPatternRewriter &rewriter, mlir::Operation *op, for (auto k = 0; k < nums; k++) { llvm::SmallVector offsets({k * blkSizes[0], 0}); auto newOp = rewriter.create( - op->getLoc(), v, offsets, blkSizes, strides); + loc, v, offsets, blkSizes, strides); auto idx = startPos + k * inGrids[1]; intermediates[idx] = newOp; } @@ -228,8 +127,7 @@ lowerUnpackOrPack(XeOneToNPatternRewriter &rewriter, mlir::Operation *op, for (auto j = 0; j < interGrids[1]; j++) { valSet.push_back(intermediates[i * interGrids[1] + j]); if (valSet.size() == nums) { - auto newOp = - mergeVectorsWrapper(valSet, concat, op->getLoc(), rewriter); + auto newOp = packVectorsWith(valSet, concat, loc, rewriter); newOps.push_back(newOp); valSet.clear(); } @@ -245,7 +143,7 @@ lowerUnpackOrPack(XeOneToNPatternRewriter &rewriter, mlir::Operation *op, for (int64_t k = 0; k < nums; k++) { llvm::SmallVector offsets({0, k * blkSizes[1]}); auto newOp = rewriter.create( - op->getLoc(), v, offsets, blkSizes, strides); + loc, v, offsets, blkSizes, strides); newOps.push_back(newOp); } } @@ -291,7 +189,7 @@ class 
SgTileUnpackOpPattern : public XeOneToNConversion {
     }
     rewriter.setInsertionPoint(op);
-    auto newOps = lowerUnpackOrPack(rewriter, op, inputs, inBlkSizes,
+    auto newOps = lowerUnpackOrPack(rewriter, op->getLoc(), inputs, inBlkSizes,
                                     outBlkSizes, inGrids, outGrids);
 
     if (op->hasOneUse() && packOp && isUnpackPackCompatible(op, packOp)) {
@@ -327,7 +225,8 @@ class SgTilePackOpPattern : public XeOneToNConversion {
     auto outGrids = outTy.getShape().take_front(2);
     auto outBlkSizes = op.getInnerBlocksAttr();
 
-    auto newOps = lowerUnpackOrPack(rewriter, op, {input}, inBlkSizes,
+    rewriter.setInsertionPoint(op);
+    auto newOps = lowerUnpackOrPack(rewriter, op->getLoc(), {input}, inBlkSizes,
                                     outBlkSizes, inGrids, outGrids);
 
     // it is simple one-to-one mapping
diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp
index 3cc88910b..88c9ef616 100644
--- a/lib/Dialect/XeTile/Transforms/Blocking.cpp
+++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp
@@ -31,9 +31,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -58,9 +61,1506 @@ namespace imex {
 #include "imex/Dialect/XeTile/Transforms/Passes.h.inc"
 } // namespace imex
 
+// TODO: Remove this flag after consolidation.
+bool Enable2DBlockingTransform = false;
+
 namespace imex {
 
+// Blocking decomposes ops working on big tile or vector sizes into
+// a set of ops working on smaller tile or vector sizes that can be
+// mapped to hardware instructions. The original implementation uses a
+// 4D tile/vector type to represent the blocking result, with the
+// outer 2 dimensions corresponding to the grid size, i.e., the number
+// of instructions and their organization, and the inner 2 dimensions
+// corresponding to the block size that can be handled by a single
+// instruction. The new implementation drops this 4D representation
+// and generates a set of xetile or vector ops working on the block
+// size directly.
 namespace Blocking {
 
+template
+class RewriteXeTileOp : public mlir::OpRewritePattern {
+public:
+  using OpPatternRewriter = typename mlir::PatternRewriter;
+
+  RewriteXeTileOp(mlir::MLIRContext *context, AnalysisT &analysis)
+      : mlir::OpRewritePattern(context), analysis(analysis) {}
+
+protected:
+  AnalysisT &analysis;
+};
+
+template