From 734811221a58e88cfc6e8cd2010820bb5b10b697 Mon Sep 17 00:00:00 2001 From: Dimple Prajapati Date: Wed, 28 Aug 2024 11:00:56 -0700 Subject: [PATCH 1/2] [test] Fix test cases with unsupported shaped buffer sizes (#847) [test] Fix test cases that does xegpu.load_nd on unsupported shaped buffer In several test cases, we use load_nd on unsupported shaped buffer. This PR fixes one of those test cases. Co-authored-by: Md Abdullah Shahneous Bari --- .../imex/ExecutionEngine/ImexRunnerUtils.h | 5 + .../Dialect/XeGPU/dynamic_memref.vc.mlir | 44 +++--- .../Integration/Dialect/XeGPU/exp_f32.vc.mlir | 89 +++++------- .../Dialect/XeGPU/fmax_f32.vc.mlir | 7 +- .../Dialect/XeGPU/load2d-padding.mlir | 42 +++--- .../Dialect/XeGPU/load2d_dpas_store2d.mlir | 97 ------------- .../Integration/Dialect/XeGPU/preop_dpas.mlir | 102 +++++++------ .../Dialect/XeGPU/vector_broadcast_1.mlir | 132 ++++++++++------- .../Dialect/XeGPU/vector_broadcast_2.mlir | 136 ++++++++++-------- .../vector_extract_strided_slice_1.vc.mlir | 122 ++++++++++------ .../Dialect/XeGPU/vector_insert_1.mlir | 51 +++---- .../Dialect/XeGPU/vector_insert_2.mlir | 49 +++---- .../Dialect/XeGPU/xegpu-to-vc.mlir | 135 +++++++++++------ 13 files changed, 510 insertions(+), 501 deletions(-) delete mode 100644 test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir diff --git a/include/imex/ExecutionEngine/ImexRunnerUtils.h b/include/imex/ExecutionEngine/ImexRunnerUtils.h index b9f03023a..464ad6418 100644 --- a/include/imex/ExecutionEngine/ImexRunnerUtils.h +++ b/include/imex/ExecutionEngine/ImexRunnerUtils.h @@ -72,6 +72,11 @@ _mlir_ciface_fillResource1DRandomF16(UnrankedMemRefType *ptr, const float lower, const float upper, const bool genInt); +extern "C" IMEX_RUNNERUTILS_EXPORT void +_mlir_ciface_fillResource1DRandomF32(UnrankedMemRefType *ptr, + const float lower, const float upper, + const bool genInt); + extern "C" IMEX_RUNNERUTILS_EXPORT void _mlir_ciface_printMemrefBF16(UnrankedMemRefType *m); extern "C" IMEX_RUNNERUTILS_EXPORT void diff --git a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir index a70a08c76..55c8f0902 100644 --- a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir +++ b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir @@ -7,55 +7,49 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A : memref<8x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref_0 = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref_0 : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref_0 : memref<8x16xf32> to memref<8x16xf32> %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - %memref_0_cast = memref.cast %memref_0 : memref<8x16xf16> to memref + %memref_0_cast = memref.cast %memref_0 : memref<8x16xf32> to memref %memref_1_cast = memref.cast %memref_1 : memref<8x16xf32> to memref - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0_cast : memref, %memref_1_cast : memref) - gpu.dealloc %memref_0 : memref<8x16xf16> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0_cast : 
memref, %memref_1_cast : memref) + gpu.dealloc %memref_0 : memref<8x16xf32> return %memref_1 : memref<8x16xf32> } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0 : memref, %arg1: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%arg0 : memref, %arg1: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index - %1 = xegpu.create_nd_tdesc %arg0[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> - %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = vector.shape_cast %2 : vector<8x16xf16> to vector<128xf16> - %5 = arith.extf %3 : vector<128xf16> to vector<128xf32> - %4 = vector.shape_cast %5 : vector<128xf32> to vector<8x16xf32> + %1 = xegpu.create_nd_tdesc %arg0[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> + %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %6 = xegpu.create_nd_tdesc %arg1[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %2, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } func.func @main() attributes {llvm.emit_c_interface} { - %A = memref.alloc() : memref<8x16xf16> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> + %A = memref.alloc() : memref<8x16xf32> + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> %c_gen_int = arith.constant 0 : i1 %cf_lower = arith.constant -0.5 : f32 %cf_upper = arith.constant 0.5 : f32 - call @fillResource1DRandomF16(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf16>, f32, f32, i1) -> () + call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> () - %B = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> - %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16> - // call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> // call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] - call @printAllcloseF16(%A_cast, %B_cast) : (memref<*xf16>, memref<*xf32>) -> () + call @printAllcloseF32(%A_cast, %B_cast) : (memref<*xf32>, memref<*xf32>) -> () - memref.dealloc %A : memref<8x16xf16> + memref.dealloc %A : memref<8x16xf32> return } - func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} - func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) 
attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir index 45036fbbf..92111fc1d 100644 --- a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir @@ -7,35 +7,28 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> + %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_3 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf32> return %memref_2, %memref_3 : memref<8x16xf32>, memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_exp_larger_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_exp_larger_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 { packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // take exp - %t6 = math.exp %val4 : vector<8x16xf32> + %t6 = math.exp %val0 : vector<8x16xf32> // store 
%out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> xegpu.store_nd %t6, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> @@ -43,27 +36,22 @@ module @gemm attributes {gpu.container_module} { } } gpu.module @module1 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_exp_generic_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_exp_generic_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - // extract dpas out into 16xf32 vectors - %cst1 = arith.constant dense<1.4426950408889634> : vector<128xf32> - %v0 = vector.extract %val4[0] : vector<16xf32> from vector<8x16xf32> - %v1 = vector.extract %val4[1] : vector<16xf32> from vector<8x16xf32> - %v2 = vector.extract %val4[2] : vector<16xf32> from vector<8x16xf32> - %v3 = vector.extract %val4[3] : vector<16xf32> from vector<8x16xf32> - %v4 = vector.extract %val4[4] : vector<16xf32> from vector<8x16xf32> - %v5 = vector.extract %val4[5] : vector<16xf32> from vector<8x16xf32> - %v6 = vector.extract %val4[6] : vector<16xf32> from vector<8x16xf32> - %v7 = vector.extract %val4[7] : vector<16xf32> from vector<8x16xf32> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + // extract the loaded vector into 16xf32 vectors + %v0 = vector.extract %val0[0] : vector<16xf32> from vector<8x16xf32> + %v1 = vector.extract %val0[1] : vector<16xf32> from vector<8x16xf32> + %v2 = vector.extract %val0[2] : vector<16xf32> from vector<8x16xf32> + %v3 = vector.extract %val0[3] : vector<16xf32> from vector<8x16xf32> + %v4 = vector.extract %val0[4] : vector<16xf32> from vector<8x16xf32> + %v5 = vector.extract %val0[5] : vector<16xf32> from vector<8x16xf32> + %v6 = vector.extract %val0[6] : vector<16xf32> from vector<8x16xf32> + %v7 = vector.extract %val0[7] : vector<16xf32> from vector<8x16xf32> // do generic size exp %v0_exp = math.exp %v0 : vector<16xf32> %v1_exp = math.exp %v1 : vector<16xf32> @@ -104,31 +92,19 @@ module @gemm attributes {gpu.container_module} { %rand_lower = arith.constant -1.0 : f32 %rand_upper = arith.constant 1.0 : f32 %gen_int = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () - call 
@fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu_large, %Out_gpu_generic = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>) + %Out_gpu_large, %Out_gpu_generic = call @test(%A) : (memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) %Out_gpu_generic_cast = memref.cast %Out_gpu_generic : memref<8x16xf32> to memref<*xf32> %Out_gpu_large_cast = memref.cast %Out_gpu_large : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> - %a0_f32 = arith.extf %a0 : f16 to f32 - %b0_f32 = arith.extf %b0 : f16 to f32 - %t0 = arith.mulf %a0_f32, %b0_f32 : f32 - %v0_new = arith.addf %v0, %t0 : f32 - scf.yield %v0_new : f32 - } - %vexp = math.exp %result#0: f32 + %a0 = memref.load %A[%i, %j] : memref<8x16xf32> + %vexp = math.exp %a0: f32 memref.store %vexp, %Out_cpu[%i, %j] : memref<8x16xf32> } } @@ -141,8 +117,7 @@ module @gemm attributes {gpu.container_module} { call @printAllcloseF32(%Out_gpu_generic_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () call @printAllcloseF32(%Out_gpu_large_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu_generic : memref<8x16xf32> @@ -150,6 +125,6 @@ module @gemm attributes {gpu.container_module} { return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir index b2bb6829a..8cb0eab7b 100644 --- a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir @@ -50,6 +50,7 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %A = memref.alloc() : memref<8x32xf16> %B = memref.alloc() : memref<16x32xf16> %Out_cpu = memref.alloc() : memref<8x16xf32> @@ -72,9 +73,9 @@ module @gemm attributes {gpu.container_module} { %v0_init = arith.constant 0.0 : f32 %v1_init = arith.constant 0.0 : f32 %result:2 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init, %v1 = %v1_init) -> (f32, f32){ - %a0 = memref.load %A[%i, %k] : memref<8x32xf16> %1 = arith.addi %k, %c16 : index %2 = arith.addi %j, %c16 : index + %a0 = memref.load %A[%i, %k] : memref<8x32xf16> %a1 = memref.load %A[%i, %1] : memref<8x32xf16> %b0 = memref.load %B[%k, %j] : memref<16x32xf16> %b1 = memref.load %B[%k, %2] : memref<16x32xf16> @@ -94,8 +95,8 @@ module @gemm attributes {gpu.container_module} { 
} %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> // print GPU and CPU outs - // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () - // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc diff --git a/test/Integration/Dialect/XeGPU/load2d-padding.mlir b/test/Integration/Dialect/XeGPU/load2d-padding.mlir index 22a97f496..b6c488f46 100644 --- a/test/Integration/Dialect/XeGPU/load2d-padding.mlir +++ b/test/Integration/Dialect/XeGPU/load2d-padding.mlir @@ -7,46 +7,46 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - // memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> - memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> + // memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<1.0> + memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<1.0> - func.func @test(%arg0: memref<8x16xf16>,%arg3:index) -> memref<8x16xf16> attributes {llvm.emit_c_interface} { + func.func @test(%arg0: memref<8x16xf32>,%arg3:index) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf16> - gpu.launch_func @test_kernel::@test_padding blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<8x16xf16>, %arg3:index) + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_padding blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %arg3:index) - gpu.dealloc %memref : memref<8x16xf16> - return %memref_1 : memref<8x16xf16> + gpu.dealloc %memref : memref<8x16xf32> + return %memref_1 : memref<8x16xf32> } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_padding(%arg0: memref<8x16xf16>, %arg1: memref<8x16xf16>,%arg3:index) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_padding(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>,%arg3:index) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %0 = xegpu.create_nd_tdesc %arg0[%arg3, %arg3] - : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %arg1[0, 0] - : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - xegpu.store_nd %3,%2 : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + xegpu.store_nd %3,%2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } 
func.func @main() attributes {llvm.emit_c_interface} { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %0 = memref.get_global @__constant_8x16xf32 : memref<8x16xf32> %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %2 = call @test(%0, %c1) : (memref<8x16xf16>, index) -> memref<8x16xf16> - %3 = call @test(%0, %c2) : (memref<8x16xf16>, index) -> memref<8x16xf16> + %2 = call @test(%0, %c1) : (memref<8x16xf32>, index) -> memref<8x16xf32> + %3 = call @test(%0, %c2) : (memref<8x16xf32>, index) -> memref<8x16xf32> %c7 = arith.constant 7 : index - %vector_0 = vector.load %2[%c7,%c0] :memref<8x16xf16>, vector<16xf16> + %vector_0 = vector.load %2[%c7,%c0] :memref<8x16xf32>, vector<16xf32> // CHECK: ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) - vector.print %vector_0 : vector<16xf16> + vector.print %vector_0 : vector<16xf32> - %vector_1 = vector.load %3[%c0,%c0] :memref<8x16xf16>, vector<16xf16> + %vector_1 = vector.load %3[%c0,%c0] :memref<8x16xf32>, vector<16xf32> // CHECK: ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 ) - vector.print %vector_1 : vector<16xf16> + vector.print %vector_1 : vector<16xf32> return } } diff --git a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir b/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir deleted file mode 100644 index 98de82596..000000000 --- a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir +++ /dev/null @@ -1,97 +0,0 @@ -// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ -// RUN: --runner imex-cpu-runner -e main \ -// RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck -// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ -// RUN: --runner imex-cpu-runner -e main \ -// RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -module @gemm attributes {gpu.container_module} { - memref.global "private" @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> - memref.global "private" @__constant_16x16xf16 : memref<16x16xf16> = dense<1.0> - memref.global "private" @__constant_16x16xf32 : memref<16x16xf32> = dense<0.0> - - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { - %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_0 : memref<16x16xf16> - return %memref_1 : memref<8x16xf32> - } - - gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] - : memref<8x16xf16> -> 
!xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[0, 0] - : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg2[0, 0] - : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %4 = xegpu.load_nd %1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %6 = xegpu.dpas %3, %4, %5 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> - xegpu.store_nd %6,%2 : vector<8x16xf32>,!xegpu.tensor_desc<8x16xf32> - gpu.return - } - } - func.func @main() attributes {llvm.emit_c_interface} { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> - %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> - %ref = memref.get_global @__constant_16x16xf32 : memref<16x16xf32> - - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - - scf.for %arg0 = %c0 to %c8 step %c1 { - scf.for %arg1 = %c0 to %c16 step %c1 { - %int0 = arith.index_cast %arg0 : index to i16 - %int1 = arith.index_cast %arg1 : index to i16 - %c16_i16 = arith.constant 16 : i16 - %idx0 = arith.muli %int0, %c16_i16 : i16 - %idx1 = arith.addi %int1, %idx0 : i16 - %fp = arith.uitofp %idx1 : i16 to f16 - %cst100 = arith.constant 1.0 : f16 - %val0 = arith.divf %fp, %cst100 : f16 - %cst1 = arith.constant 1.0 : f16 - %val1 = arith.addf %val0, %cst1 : f16 - memref.store %val0, %0[%arg0, %arg1] : memref<8x16xf16> - memref.store %val1, %1[%arg0, %arg1] : memref<16x16xf16> - } - } - // caculate the result C matrix - scf.for %arg0 = %c0 to %c8 step %c1 { - scf.for %arg1 = %c0 to %c16 step %c1 { - %acc = memref.load %ref[%arg0, %arg1] : memref<16x16xf32> - %res = scf.for %arg2 = %c0 to %c1024 step %c1 iter_args(%arg3 = %acc) -> f32 { - %a = memref.load %0[%arg0, %arg2] : memref<8x16xf16> - %b = memref.load %1[%arg2, %arg1] : memref<16x16xf16> - %c = arith.mulf %a, %b : f16 - %cc = arith.extf %c : f16 to f32 - %ccc = arith.addf %cc, %arg3 : f32 - scf.yield %ccc : f32 - } - memref.store %res, %ref[%arg0, %arg1] : memref<16x16xf32> - } - } - - %cast_ref = memref.cast %ref : memref<16x16xf32> to memref<*xf32> - %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> - // call @printMemrefF32(%cast) : (memref<*xf32>) -> () - // CHECK: [ALLCLOSE: TRUE] - call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () - return - } - func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} -} diff --git a/test/Integration/Dialect/XeGPU/preop_dpas.mlir b/test/Integration/Dialect/XeGPU/preop_dpas.mlir index c3bef77ca..cc984886d 100644 --- a/test/Integration/Dialect/XeGPU/preop_dpas.mlir +++ b/test/Integration/Dialect/XeGPU/preop_dpas.mlir @@ -7,39 +7,57 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0> - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { 
+ memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__Bconstant_32x32xf16 : memref<32x32xf16> = dense<2.0> + func.func @test(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_0 : memref<16x16xf16> - return %memref_1 : memref<8x16xf32> + %memref = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg0, %memref : memref<32x32xf16> to memref<32x32xf16> + %memref_0 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg1, %memref_0 : memref<32x32xf16> to memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_0 : memref<32x32xf16>, %memref_1 : memref<32x32xf32>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_0 : memref<32x32xf16> + return %memref_1 : memref<32x32xf32> } - gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %C: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { +gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %C: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + + %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index %c8 = arith.constant 8 : index - %c1024 = arith.constant 1024 : index %cst = arith.constant dense<1.0> : vector<8x16xf16> %0 = gpu.block_id x %1 = gpu.block_id y - %4 = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %7 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %8 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %9 = xegpu.load_nd %7 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %10 = xegpu.load_nd %8 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %13 = arith.addf %9, %cst : vector<8x16xf16> - %11 = xegpu.dpas %13, %10 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - xegpu.store_nd %11, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %C[%2, %3] : memref<32x32xf32> -> 
!xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + %A0 = xegpu.create_nd_tdesc %A[%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + %B0 = xegpu.create_nd_tdesc %B[%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + %A0_preop = arith.addf %A0_val, %cst : vector<8x16xf16> + + %dpas0 = xegpu.dpas %A0_preop, %B0_val , %arg4: vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %dpas0 : vector<8x16xf32> + } + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return } } @@ -47,28 +65,19 @@ module @gemm attributes {gpu.container_module} { func.func @main() attributes {llvm.emit_c_interface} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index - %rand_lower = arith.constant -2.0 : f32 - %rand_upper = arith.constant 2.0 : f32 - %gen_int = arith.constant 1 : i1 + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = memref.get_global @__Bconstant_32x32xf16 : memref<32x32xf16> + %C_ref = memref.alloc() : memref<32x32xf32> - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %C_ref = memref.get_global @__constant_8x16xf32 : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () // caculate the result C matrix - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { - %acc = memref.load %C_ref[%i, %j] : memref<8x16xf32> - %res = scf.for %k = %c0 to %c16 step %c1 iter_args(%acc1 = %acc) -> f32 { - %a = memref.load %A[%i, %k] : memref<8x16xf16> - %b = memref.load %B[%k, %j] : memref<16x16xf16> + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { + %acc = arith.constant 0.0 : f32 + %res = scf.for %k = %c0 to %c32 step %c1 iter_args(%acc1 = %acc) -> f32 { + %a = memref.load %A[%i, %k] : memref<32x32xf16> + %b = memref.load %B[%k, %j] : memref<32x32xf16> // adjust for preop in GPU kernel, where we add 1 between load and dpas %cst1 = arith.constant 1.0 : f16 %a_adj = arith.addf %a, %cst1 : f16 @@ -77,21 +86,20 @@ module @gemm attributes {gpu.container_module} { %ccc = arith.addf %cc, %acc1 : f32 scf.yield %ccc : f32 } - memref.store %res, %C_ref[%i, %j] : memref<8x16xf32> + memref.store %res, %C_ref[%i, %j] : memref<32x32xf32> } } - %2 = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + %2 = call @test(%A, %B) : (memref<32x32xf16>, memref<32x32xf16>) -> memref<32x32xf32> + %cast = memref.cast %2 : memref<32x32xf32> to memref<*xf32> // call @printMemrefF32(%cast) : (memref<*xf32>) -> () - %cast_ref = memref.cast %C_ref : memref<8x16xf32> to memref<*xf32> + %cast_ref = memref.cast %C_ref : memref<32x32xf32> to memref<*xf32> // call 
@printMaxErrorF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () // call @printMemrefF32(%cast_ref) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () return } - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} func.func private @printMaxErrorF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir index 4cbfab6bc..2b6922ec3 100644 --- a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir @@ -7,47 +7,73 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__constant_B32x32xf16 : memref<32x32xf16> = dense<2.0> + memref.global "private" @__constant_1x32xf16 : memref<1x32xf16> = dense<10.0> + func.func @test(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %bcast : memref<1x32xf16> ) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref = gpu.alloc host_shared () : memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> %memref_2 = gpu.alloc host_shared () : memref<1x32xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + memref.copy %A, %memref : memref<32x32xf16> to memref<32x32xf16> + memref.copy %B, %memref_1 : memref<32x32xf16> to memref<32x32xf16> memref.copy %bcast, %memref_2 : memref<1x32xf16> to memref<1x32xf16> - %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>, %memref_2 : memref<1x32xf16>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + %memref_3 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_3 : memref<32x32xf32>, %memref_2 : memref<1x32xf16>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> gpu.dealloc %memref_2 : memref<1x32xf16> - return %memref_3 : memref<8x16xf32> + return %memref_3 : memref<32x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = 
#spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %Out: memref<32x32xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index - // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // load B cast - %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> - %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> - // extract first 16 elems - %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 16]} - : vector<1x32xf16> to vector<1x16xf16> - // broadcast over row dim - %val6 = vector.broadcast %val5 : vector<1x16xf16> to vector<8x16xf16> - // add to A - %val8 = arith.addf %val0, %val6 : vector<8x16xf16> - // do DPAS - %val4 = xegpu.dpas %val8, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %c32 = arith.constant 32 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %Out[%2, %3] : memref<32x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + + // load A tile + %a_tile0 = xegpu.create_nd_tdesc %A [%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + // load B tile + %b_tile0 = xegpu.create_nd_tdesc %B [%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + // load B cast + %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> + %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> + + // extract first 16 elems + %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 16]} + : vector<1x32xf16> to vector<1x16xf16> + // broadcast over row dim + %val6 = vector.broadcast %val5 : vector<1x16xf16> to vector<8x16xf16> + // add to A + %A0_val8 = arith.addf %A0_val, %val6 : vector<8x16xf16> + + // do DPAS + %dpas = xegpu.dpas %A0_val8, %B0_val, %arg4 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + + scf.yield %dpas : vector<8x16xf32> + } // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -57,31 +83,33 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1_f32 = 
arith.constant 1.0 : f32 // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %bcast = memref.alloc() : memref<1x32xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = memref.get_global @__constant_B32x32xf16 : memref<32x32xf16> + %bcast = memref.get_global @__constant_1x32xf16 : memref<1x32xf16> + + %Out_cpu = memref.alloc() : memref<32x32xf32> + + %A_random = memref.cast %A : memref<32x32xf16> to memref<*xf16> + %B_random = memref.cast %B : memref<32x32xf16> to memref<*xf16> %bcast_random = memref.cast %bcast : memref<1x32xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%bcast_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + // run GPU version - %Out_gpu = call @test(%A, %B, %bcast) : (memref<8x16xf16>, memref<16x16xf16>, memref<1x32xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B, %bcast) : (memref<32x32xf16>, memref<32x32xf16>, memref<1x32xf16>) -> memref<32x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<32x32xf32> to memref<*xf32> // run CPU version - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %result:1 = scf.for %k = %c0 to %c32 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<32x32xf16> + %b0 = memref.load %B[%k, %j] : memref<32x32xf16> %bcast_val = memref.load %bcast[%c0, %k] : memref<1x32xf16> %t1 = arith.addf %a0, %bcast_val : f16 %a0_f32 = arith.extf %t1 : f16 to f32 @@ -91,21 +119,15 @@ module @gemm attributes {gpu.container_module} { scf.yield %v0_new : f32 } // only update the first 8x8 of the result, next 8x8 is value 1 - memref.store %result#0, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %j] : memref<32x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<32x32xf32> to memref<*xf32> // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir index e523f0c9e..007768ce7 100644 --- 
a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir @@ -7,49 +7,75 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__constant_B32x32xf16 : memref<32x32xf16> = dense<2.0> + memref.global "private" @__constant_1x32xf16 : memref<1x32xf16> = dense<10.0> + func.func @test(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %bcast : memref<1x32xf16> ) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref = gpu.alloc host_shared () : memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> %memref_2 = gpu.alloc host_shared () : memref<1x32xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + memref.copy %A, %memref : memref<32x32xf16> to memref<32x32xf16> + memref.copy %B, %memref_1 : memref<32x32xf16> to memref<32x32xf16> memref.copy %bcast, %memref_2 : memref<1x32xf16> to memref<1x32xf16> - %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>, %memref_2 : memref<1x32xf16>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + %memref_3 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_3 : memref<32x32xf32>, %memref_2 : memref<1x32xf16>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> gpu.dealloc %memref_2 : memref<1x32xf16> - return %memref_3 : memref<8x16xf32> + return %memref_3 : memref<32x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %Out: memref<32x32xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index - // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // load B cast - 
%bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> - %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> - // extract first 8 elems - %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 8]} - : vector<1x32xf16> to vector<1x8xf16> - // reshape and broadcast over col dim - %val6 = vector.shape_cast %val5 : vector<1x8xf16> to vector<8xf16> - %t = vector.shape_cast %val6 : vector<8xf16> to vector<8x1xf16> - %val7 = vector.broadcast %t : vector<8x1xf16> to vector<8x16xf16> - // add to A - %val9 = arith.addf %val0, %val7 : vector<8x16xf16> - // do DPAS - %val4 = xegpu.dpas %val9, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %c32 = arith.constant 32 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %Out[%2, %3] : memref<32x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + + // load A tile + %a_tile0 = xegpu.create_nd_tdesc %A [%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + // load B tile + %b_tile0 = xegpu.create_nd_tdesc %B [%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + // load B cast + %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> + %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> + + // extract first 8 elems + %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 8]} + : vector<1x32xf16> to vector<1x8xf16> + // reshape and broadcast over col dim + %val6 = vector.shape_cast %val5 : vector<1x8xf16> to vector<8xf16> + %t = vector.shape_cast %val6 : vector<8xf16> to vector<8x1xf16> + %val7 = vector.broadcast %t : vector<8x1xf16> to vector<8x16xf16> + // add to A + %A0_val8 = arith.addf %A0_val, %val7 : vector<8x16xf16> + + // do DPAS + %dpas = xegpu.dpas %A0_val8, %B0_val, %arg4 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + + scf.yield %dpas : vector<8x16xf32> + } // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -59,31 +85,33 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1_f32 = arith.constant 1.0 : f32 // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %bcast = memref.alloc() : memref<1x32xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = 
memref.get_global @__constant_B32x32xf16 : memref<32x32xf16> + %bcast = memref.get_global @__constant_1x32xf16 : memref<1x32xf16> + + %Out_cpu = memref.alloc() : memref<32x32xf32> + + %A_random = memref.cast %A : memref<32x32xf16> to memref<*xf16> + %B_random = memref.cast %B : memref<32x32xf16> to memref<*xf16> %bcast_random = memref.cast %bcast : memref<1x32xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%bcast_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + // run GPU version - %Out_gpu = call @test(%A, %B, %bcast) : (memref<8x16xf16>, memref<16x16xf16>, memref<1x32xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B, %bcast) : (memref<32x32xf16>, memref<32x32xf16>, memref<1x32xf16>) -> memref<32x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<32x32xf32> to memref<*xf32> // run CPU version - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %result:1 = scf.for %k = %c0 to %c32 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<32x32xf16> + %b0 = memref.load %B[%k, %j] : memref<32x32xf16> %bcast_val = memref.load %bcast[%c0, %i] : memref<1x32xf16> %t1 = arith.addf %a0, %bcast_val : f16 %a0_f32 = arith.extf %t1 : f16 to f32 @@ -93,21 +121,15 @@ module @gemm attributes {gpu.container_module} { scf.yield %v0_new : f32 } // only update the first 8x8 of the result, next 8x8 is value 1 - memref.store %result#0, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %j] : memref<32x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<32x32xf32> to memref<*xf32> // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir index 53494e0cf..46ded4f4f 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir @@ -7,41 +7,63 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_8x32xf16 : 
memref<8x32xf16> = dense<1.0> + memref.global "private" @__constant_16x32xf16 : memref<16x32xf16> = dense<2.0> + + func.func @test(%A: memref<8x32xf16>, %B: memref<16x32xf16> ) -> memref<8x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> - %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> - return %memref_2 : memref<8x16xf32> + %memref = gpu.alloc host_shared () : memref<8x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<16x32xf16> + memref.copy %A, %memref : memref<8x32xf16> to memref<8x32xf16> + memref.copy %B, %memref_1 : memref<16x32xf16> to memref<16x32xf16> + %memref_2 = gpu.alloc host_shared () : memref<8x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x32xf16>, %memref_1 : memref<16x32xf16>, %memref_2 : memref<8x32xf32>) + gpu.dealloc %memref : memref<8x32xf16> + gpu.dealloc %memref_1 : memref<16x32xf16> + return %memref_2 : memref<8x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x32xf16>, %B: memref<16x32xf16>, %C: memref<8x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %A0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A1 = xegpu.create_nd_tdesc %A[%c0, %c16] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %A1_val = xegpu.load_nd %A1 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %B0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B1 = xegpu.create_nd_tdesc %B[%c0, %c16] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %B1_val = xegpu.load_nd %B1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas0 = xegpu.dpas %A0_val, %B0_val : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas1 = xegpu.dpas %A1_val, %B1_val : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + // extract second 8x8 - %val5 = 
vector.extract_strided_slice %val4 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %val5_0 = vector.extract_strided_slice %dpas0 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %val5_1 = vector.extract_strided_slice %dpas1 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %cst_8x8_flat = arith.constant dense<1.0> : vector<64xf32> %cst_8x8 = vector.shape_cast %cst_8x8_flat : vector<64xf32> to vector<8x8xf32> // shift the first half to left and use %cst_8x8 as the second half - %val6 = vector.shuffle %val5, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> - %val7 = vector.shape_cast %val6 : vector<16x8xf32> to vector<8x16xf32> + + %val6_0 = vector.shuffle %val5_0, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> + %val6_1 = vector.shuffle %val5_1, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> + + %val7_0 = vector.shape_cast %val6_0 : vector<16x8xf32> to vector<8x16xf32> + %val7_1 = vector.shape_cast %val6_1 : vector<16x8xf32> to vector<8x16xf32> + // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val7, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %out_tile_0 = xegpu.create_nd_tdesc %C [%c0, %c0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %out_tile_1 = xegpu.create_nd_tdesc %C [%c0, %c16] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %val7_0, %out_tile_0 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val7_1, %out_tile_1 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return } } @@ -52,51 +74,67 @@ module @gemm attributes {gpu.container_module} { %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 + %c24 = arith.constant 24 : index + + %c32 = arith.constant 32 : index + // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + %A = memref.get_global @__constant_8x32xf16 : memref<8x32xf16> + %B =memref.get_global @__constant_16x32xf16 : memref<16x32xf16> + %Out_cpu = memref.alloc() : memref<8x32xf32> // run GPU version - %Out_gpu = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B) : (memref<8x32xf16>, memref<16x32xf16>) -> memref<8x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<8x32xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c8 to %c16 step %c1 { %v0_init = arith.constant 0.0 : f32 %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %a0 = memref.load %A[%i, %k] : 
memref<8x32xf16> + %b0 = memref.load %B[%k, %j] : memref<16x32xf16> + %a0_f32 = arith.extf %a0 : f16 to f32 + %b0_f32 = arith.extf %b0 : f16 to f32 + %t0 = arith.mulf %a0_f32, %b0_f32 : f32 + %v0_new = arith.addf %v0, %t0 : f32 + scf.yield %v0_new : f32 + } + // only update the 8x8 of first half of 8x32 of the result, next 8x8 is value 1 + %shifted_j = arith.subi %j, %c8 : index + memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x32xf32> + memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x32xf32> + } + } + + // run CPU version + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c24 to %c32 step %c1 { + %v0_init = arith.constant 0.0 : f32 + %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<8x32xf16> + %b0 = memref.load %B[%k, %j] : memref<16x32xf16> %a0_f32 = arith.extf %a0 : f16 to f32 %b0_f32 = arith.extf %b0 : f16 to f32 %t0 = arith.mulf %a0_f32, %b0_f32 : f32 %v0_new = arith.addf %v0, %t0 : f32 scf.yield %v0_new : f32 } - // only update the first 8x8 of the result, next 8x8 is value 1 + // only update the 8x8 of second half of 8x32 of the result, next 8x8 is value 1 %shifted_j = arith.subi %j, %c8 : index - memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x16xf32> - memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x32xf32> + memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<8x32xf32> to memref<*xf32> + // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> + return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir index 53b57afc8..9ee4a30d0 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir @@ -7,31 +7,30 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.dealloc 
%memref : memref<8x16xf32> return %memref_2 : memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // extract row at pos 2 - %a_row = vector.extract %val0 [2] : vector<16xf16> from vector<8x16xf16> + %a_row = vector.extract %val0 [2] : vector<16xf32> from vector<8x16xf32> // insert row at pos 7 - %val3 = vector.insert %a_row, %val0 [7] : vector<16xf16> into vector<8x16xf16> - %val4 = arith.extf %val3 : vector<8x16xf16> to vector<8x16xf32> + %val3 = vector.insert %a_row, %val0 [7] : vector<16xf32> into vector<8x16xf32> // store %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val3, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -45,33 +44,29 @@ module @gemm attributes {gpu.container_module} { %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 %c2_f32 = arith.constant 2.0 : f32 - %cst = arith.constant 2.0 : f16 + %cst = arith.constant 2.0 : f32 // random init %lower = arith.constant -3.0 : f32 %upper = arith.constant 3.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - // call @fillResource1DF16(%A_random, %c1_f32) : (memref<*xf16>, f32) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %lower, %upper, %false) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %Out_gpu = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v = memref.load %A[%i, %j] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + %v = memref.load %A[%i, %j] : memref<8x16xf32> + memref.store %v, %Out_cpu[%i, %j] : memref<8x16xf32> } } scf.for %i = %c0 to %c16 step %c1 { - %v = memref.load %A[%c2, %i] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%c7, %i] : memref<8x16xf32> + %v = memref.load %A[%c2, %i] : memref<8x16xf32> + memref.store %v, %Out_cpu[%c7, %i] : memref<8x16xf32> } %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to 
memref<*xf32> @@ -81,15 +76,13 @@ module @gemm attributes {gpu.container_module} { // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir index bcca52e98..aec7eef99 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir @@ -7,31 +7,30 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf32> return %memref_2 : memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // define const vector - %cst = arith.constant dense<1.23> : vector<16xf16> + %cst = arith.constant dense<1.23> : vector<16xf32> // insert row at pos 7 - %val3 = vector.insert %cst, %val0 [7] : vector<16xf16> into vector<8x16xf16> - %val4 = arith.extf %val3 : vector<8x16xf16> 
to vector<8x16xf32> + %val3 = vector.insert %cst, %val0 [7] : vector<16xf32> into vector<8x16xf32> // store %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val3, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -45,32 +44,28 @@ module @gemm attributes {gpu.container_module} { %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 %c2_f32 = arith.constant 2.0 : f32 - %cst = arith.constant 1.23 : f16 + %cst = arith.constant 1.23 : f32 // random init %lower = arith.constant -3.0 : f32 %upper = arith.constant 3.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - // call @fillResource1DF16(%A_random, %c1_f32) : (memref<*xf16>, f32) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %lower, %upper, %false) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %Out_gpu = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v = memref.load %A[%i, %j] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + %v = memref.load %A[%i, %j] : memref<8x16xf32> + memref.store %v, %Out_cpu[%i, %j] : memref<8x16xf32> } } scf.for %i = %c0 to %c16 step %c1 { - %cst_f32 = arith.extf %cst : f16 to f32 - memref.store %cst_f32, %Out_cpu[%c7, %i] : memref<8x16xf32> + memref.store %cst, %Out_cpu[%c7, %i] : memref<8x16xf32> } %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> @@ -80,15 +75,13 @@ module @gemm attributes {gpu.container_module} { // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir index fe13bcc47..c16cc3815 100644 --- a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir +++ b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir @@ -8,46 +8,77 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module, spirv.target_env = 
#spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> - memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> { + memref.global "private" constant @__constant_32x32xf16 : memref<32x32xf16> = dense<5.000000e-01> + memref.global "private" constant @__Bconstant_32x32xf16 : memref<32x32xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>) -> memref<32x32xf32> { %c1 = arith.constant 1 : index - %memref_0 = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref_0 : memref<8x16xf16> to memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_1 : memref<16x16xf16> to memref<16x16xf16> - %memref_c = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_c : memref<8x16xf32>) - %result = memref.alloc() : memref<8x16xf32> - memref.copy %memref_c, %result: memref<8x16xf32> to memref<8x16xf32> - gpu.dealloc %memref_0 : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> - gpu.dealloc %memref_c :memref<8x16xf32> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref_0 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg0, %memref_0 : memref<32x32xf16> to memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg1, %memref_1 : memref<32x32xf16> to memref<32x32xf16> + %memref_c = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_c : memref<32x32xf32>) + %result = memref.alloc() : memref<32x32xf32> + memref.copy %memref_c, %result: memref<32x32xf32> to memref<32x32xf32> + gpu.dealloc %memref_0 : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> + gpu.dealloc %memref_c :memref<32x32xf32> - return %result : memref<8x16xf32> + return %result : memref<32x32xf32> } gpu.module @test_kernel { - gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{ + gpu.func @test_kernel(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>}{ + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + %128 = arith.muli %c8, %c16 : index + %256 = arith.muli %128, %c2 : index + %x = arith.muli %256, %0 : index + %y = arith.muli %128, %1 : index + + %c_index = arith.addi %x, %y : index + %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [1024], strides: [1] : memref<32x32xf32> to memref<1024xf32> + %C0 = xegpu.create_nd_tdesc %arg02[%c_index] : memref<1024xf32> -> 
!xegpu.tensor_desc<128xf32> + %5 = xegpu.load_nd %C0 : !xegpu.tensor_desc<128xf32> -> vector<128xf32> + + %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [1024], strides: [1] : memref<32x32xf16> to memref<1024xf16> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<128xf32>) { + %a_index = arith.addi %x, %arg3 : index + %A0 = xegpu.create_nd_tdesc %arg00[%a_index]: memref<1024xf16> -> !xegpu.tensor_desc<128xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<128xf16> -> vector<128xf16> + + %B0 = xegpu.create_nd_tdesc %arg1[%arg3, %3] {boundary_check = true} : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + %A0_cast = vector.shape_cast %A0_val : vector<128xf16> to vector<8x8x2xf16> + + %dpas0 = xegpu.dpas %A0_cast, %B0_val : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas0_cast = vector.shape_cast %dpas0: vector<8x16xf32> to vector<128xf32> + + scf.yield %dpas0_cast : vector<128xf32> + } + xegpu.store_nd %6, %C0 : vector<128xf32>, !xegpu.tensor_desc<128xf32> - %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf16> to memref<128xf16> - %0 = xegpu.create_nd_tdesc %arg00[0]: memref<128xf16> -> !xegpu.tensor_desc<128xf16> - %1 = xegpu.create_nd_tdesc %arg1[0, 0] {boundary_check = true} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf32> to memref<128xf32> - %2 = xegpu.create_nd_tdesc %arg02[0] : memref<128xf32> -> !xegpu.tensor_desc<128xf32> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<128xf16> -> vector<128xf16> - %4 = xegpu.load_nd %1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %6 = vector.shape_cast %3: vector<128xf16> to vector<8x8x2xf16> - %5 = xegpu.dpas %6, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32> - xegpu.store_nd %7, %2 : vector<128xf32>, !xegpu.tensor_desc<128xf32> gpu.return } } func.func @main() { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> - %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> - %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + %0 = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %1 = memref.get_global @__Bconstant_32x32xf16 : memref<32x32xf16> + %2 = call @test(%0, %1) : (memref<32x32xf16>, memref<32x32xf16>) -> memref<32x32xf32> + %cast = memref.cast %2 : memref<32x32xf32> to memref<*xf32> call @printMemrefF32(%cast) : (memref<*xf32>) -> () return } @@ -55,12 +86,36 @@ spirv.target_env = #spirv.target_env<#spirv.vce Date: Wed, 28 Aug 2024 13:50:32 -0500 Subject: [PATCH 2/2] Reimplement the blocking pass with backward dataflow analysis framework. 
(#848) --- include/imex/Dialect/XeTile/IR/XeTileOps.td | 9 +- .../imex/Dialect/XeTile/Transforms/Passes.h | 2 + .../imex/Dialect/XeTile/Transforms/Passes.td | 26 + .../XeTileToXeGPU/XeTileOpConversion.cpp | 13 +- lib/Dialect/XeTile/IR/XeTileOps.cpp | 4 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 23 +- .../XeTile/Transforms/BlockingAnalysis.cpp | 778 ++++++++++++++++ .../XeTile/Transforms/BlockingAnalysis.h | 68 ++ .../XeTile/Transforms/BlockingRewrite.cpp | 875 ++++++++++++++++++ lib/Dialect/XeTile/Transforms/CMakeLists.txt | 6 +- .../XeTile/Transforms/Canonicalization.cpp | 2 +- lib/Dialect/XeTile/Transforms/PassDetail.h | 4 + test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 4 +- test/Dialect/XeTile/IR/invalid.mlir | 2 +- test/Dialect/XeTile/IR/ops.mlir | 8 +- .../Blocking/persistent_kernel.mlir | 201 ++++ .../Blocking/sg_gemm_1k_1k_1k_f16_f32.mlir | 99 ++ .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 45 + .../Blocking/sg_gemm_1k_1k_1k_i8_i32.mlir | 79 ++ .../Transforms/Blocking/unit_tests.mlir | 339 +++++++ test/Dialect/XeTile/Transforms/blocking.mlir | 12 +- .../XeTile/block_reduce_dim_0_fp32.mlir | 2 +- .../XeTile/block_reduce_dim_1_fp32.mlir | 2 +- .../XeTile/block_softmax_dim_0_fp32.mlir | 2 +- .../XeTile/block_softmax_dim_1_fp32.mlir | 2 +- 26 files changed, 2567 insertions(+), 44 deletions(-) create mode 100644 lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp create mode 100644 lib/Dialect/XeTile/Transforms/BlockingAnalysis.h create mode 100644 lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp create mode 100644 test/Dialect/XeTile/Transforms/Blocking/persistent_kernel.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_i8_i32.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/unit_tests.mlir diff --git a/include/imex/Dialect/XeTile/IR/XeTileOps.td b/include/imex/Dialect/XeTile/IR/XeTileOps.td index 55399a7a9..cf0f6eed5 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileOps.td +++ b/include/imex/Dialect/XeTile/IR/XeTileOps.td @@ -447,6 +447,9 @@ def XeTile_TileMMAOp : XeTile_Op<"tile_mma", []> { mlir::Type getElementType() { return getA().getType().getElementType(); } + mlir::VectorType getOutputType() { + return getOutput().getType(); + } }]; let hasVerifier = 1; @@ -581,7 +584,7 @@ def XeTile_TransposeOp: XeTile_Op<"transpose", []> { let hasVerifier = 1; } -def XeTile_ReduceOp: XeTile_Op<"reduce", []> { +def XeTile_ReductionOp: XeTile_Op<"reduction", []> { let summary = "performs a reduction operation over a 2D vector."; let description = [{ It has the same semantics as the `vector.multi_reduction`, @@ -591,10 +594,10 @@ def XeTile_ReduceOp: XeTile_Op<"reduce", []> { let arguments = (ins Vector_CombiningKindAttr: $kind, XeTile_2DOr4DVector: $source, - DenseI64ArrayAttr: $reduction_dim); + DenseI64ArrayAttr: $reduction_dims); let results = (outs XeTile_2DOr4DVector: $result); let assemblyFormat = [{ - $kind `,` $source $reduction_dim attr-dict `:` type($source) `->` type($result) + $kind `,` $source $reduction_dims attr-dict `:` type($source) `->` type($result) }]; let hasVerifier = 1; diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.h b/include/imex/Dialect/XeTile/Transforms/Passes.h index 0f1d948a4..91b002c76 100644 --- a/include/imex/Dialect/XeTile/Transforms/Passes.h +++ 
b/include/imex/Dialect/XeTile/Transforms/Passes.h @@ -40,6 +40,8 @@ std::unique_ptr createXeTileInitDuplicatePass(); std::unique_ptr createXeTileBlockingPass(const std::string &device = "pvc"); +std::unique_ptr +createNewXeTileBlockingPass(const std::string &device = "pvc"); std::unique_ptr createXeTileBlockAligningPass(); std::unique_ptr createXeTileWgToSgPass(); std::unique_ptr createXeTileOptimizeTransposePass(); diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.td b/include/imex/Dialect/XeTile/Transforms/Passes.td index d0737931c..242a90a5e 100644 --- a/include/imex/Dialect/XeTile/Transforms/Passes.td +++ b/include/imex/Dialect/XeTile/Transforms/Passes.td @@ -130,5 +130,31 @@ def XeTileCanonicalization : Pass<"xetile-canonicalization", "::mlir::gpu::GPUMo ]; } +def NewXeTileBlocking : Pass<"new-xetile-blocking", "::mlir::gpu::GPUModuleOp">{ + let summary = "transform XeTile large tiles(input) into arrays of smaller " + "blocks with appropriate size, such that the operator on each " + "of the blocks can be mapped into one hardware instruction."; + + let description = [{ + This transform pass preprocesses the xetile program by decomposing large XeTile tiles + into smaller ones that can be handled by a hardware instruction. It is going to replace + the xetile-blocking pass. + }]; + + let constructor = "imex::createNewXeTileBlockingPass()"; + let dependentDialects = ["imex::xetile::XeTileDialect", + "mlir::arith::ArithDialect", + "mlir::math::MathDialect", + "mlir::gpu::GPUDialect", + "mlir::memref::MemRefDialect", + "mlir::vector::VectorDialect"]; + + let options = [ + Option<"device", "device", "std::string", + /*default=*/"\"pvc\"", + "gpu platform architecture where these ops are running"> + ]; +} + #endif // _XeTile_PASSES_TD_INCLUDED_ diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index d471c8f1b..83d392e54 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -736,15 +736,16 @@ extern llvm::SmallVector lowerInnerReductionWithVectorReduction( mlir::vector::CombiningKind kind, mlir::Location loc, mlir::Type elemTy, XeOneToNPatternRewriter &rewriter); -struct SgTileReduceOpPattern : public XeOneToNConversion { - using XeOneToNConversion::XeOneToNConversion; +struct SgTileReductionOpPattern + : public XeOneToNConversion { + using XeOneToNConversion::XeOneToNConversion; mlir::LogicalResult - matchAndRewrite(xetile::ReduceOp op, OpAdaptor adaptor, + matchAndRewrite(xetile::ReductionOp op, OpAdaptor adaptor, XeOneToNPatternRewriter &rewriter) const override { auto srcTy = op.getSource().getType(); auto elemTy = srcTy.getElementType(); - auto dims = op.getReductionDim(); + auto dims = op.getReductionDims(); // its input should be a 4D vector, and has 2 reduction dims, // otherwise run the blocking pass first. 
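+    // Illustrative example (shapes are hypothetical, not taken from a test):
+    // reducing dim 1 of a vector<64x64xf16> that was blocked with inner
+    // blocks [1, 16] reaches this pattern as a vector<64x4x1x16xf16> whose
+    // reduction dims are [1, 3], i.e. one grid dim plus one inner-block dim.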
if (dims.size() != 2 || srcTy.getRank() != 4) @@ -1092,8 +1093,8 @@ void populateXeTileOpConversionPatterns(imex::XeOneToNTypeConverter &converter, SgTileMMAOpPattern, SgUpdateTileOffsetOpPattern, SgTransposeOpPattern, SgTransposeOpPattern, SgBroadcastOpPattern, - SgTileReduceOpPattern, SgVectorCreateMaskOpPattern>(patterns.getContext(), - converter, analysis); + SgTileReductionOpPattern, SgVectorCreateMaskOpPattern>( + patterns.getContext(), converter, analysis); patterns.insert, ElementWiseOpPattern, ElementWiseOpPattern, diff --git a/lib/Dialect/XeTile/IR/XeTileOps.cpp b/lib/Dialect/XeTile/IR/XeTileOps.cpp index d52caf164..060b93a69 100644 --- a/lib/Dialect/XeTile/IR/XeTileOps.cpp +++ b/lib/Dialect/XeTile/IR/XeTileOps.cpp @@ -859,8 +859,8 @@ mlir::LogicalResult TransposeOp::verify() { return mlir::success(); } -mlir::LogicalResult ReduceOp::verify() { - auto dims = getReductionDim(); +mlir::LogicalResult ReductionOp::verify() { + auto dims = getReductionDims(); auto resShape = getResult().getType().getShape(); for (auto i : dims) if (resShape[i] != 1) diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index fdefa8e76..7bbec87d1 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -556,15 +556,16 @@ struct VectorMultiDimReductionOpPattern } }; -struct TileReduceOpPattern - : public XeTileConversion { +struct TileReductionOpPattern + : public XeTileConversion { - using XeTileConversion::XeTileConversion; + using XeTileConversion::XeTileConversion; - TileReduceOpPattern(mlir::MLIRContext *context, - imex::XeTypeConverter &converter, - TileUsageAnalysis &analysis, - std::shared_ptr ptruArch) + TileReductionOpPattern(mlir::MLIRContext *context, + imex::XeTypeConverter &converter, + TileUsageAnalysis &analysis, + std::shared_ptr ptruArch) : XeTileConversion(context, converter, analysis) { this->uArchInterface = ptruArch; } @@ -572,13 +573,13 @@ struct TileReduceOpPattern std::shared_ptr uArchInterface = nullptr; mlir::LogicalResult - matchAndRewrite(xetile::ReduceOp op, OpAdaptor adaptor, + matchAndRewrite(xetile::ReductionOp op, OpAdaptor adaptor, OpPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto srcTy = op.getSource().getType(); auto elemTy = srcTy.getElementType(); auto shape = srcTy.getShape(); - auto reductionDims = op.getReductionDim(); + auto reductionDims = op.getReductionDims(); if (srcTy.getRank() != 2 || reductionDims.size() != 1) return rewriter.notifyMatchFailure( @@ -611,7 +612,7 @@ struct TileReduceOpPattern auto newSource = addPackOp(adaptor.getSource(), {blkSizes[0], blkSizes[1]}, rewriter); - auto newDest = rewriter.create( + auto newDest = rewriter.create( loc, newDestType, op.getKind(), newSource, newReductionDims); auto unpack = addUnpackOp(newDest.getResult(), rewriter); rewriter.replaceOp(op, unpack); @@ -1161,7 +1162,7 @@ void populateXeTileBlockingPatterns( VectorizableOpPattern, SCFForOpPattern, SCFYieldOpPattern, InitTileOpPattern, LoadTileOpPattern, StoreTileOpPattern, TileMMAOpPattern, UpdateTileOffsetOpPattern, - VectorMultiDimReductionOpPattern, TileReduceOpPattern, + VectorMultiDimReductionOpPattern, TileReductionOpPattern, TileBroadcastOpPattern>(patterns.getContext(), converter, analysis, ptruArch); patterns.insert, diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp new file mode 100644 index 000000000..ada144241 --- /dev/null +++ 
b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp @@ -0,0 +1,778 @@ +#include +#include +#include + +#include "BlockingAnalysis.h" + +namespace llvm { +using imex::Block; +// Implementation of llvm::DenseMapInfo for Block, required for +// using Block as a value in DenseMap. +template <> struct DenseMapInfo { + static inline Block getEmptyKey() { + return Block(-1, -1); // the empty key + } + + static inline Block getTombstoneKey() { + return Block(-2, -2); // the tombstone key + } + + static unsigned getHashValue(const Block &b) { + return hash_combine(b[0], b[1]); + } + + static bool isEqual(const Block &lhs, const Block &rhs) { return lhs == rhs; } +}; +} // namespace llvm + +namespace imex { + +// ===------------------ Block Implementation --------------------------===// + +int64_t &Block::operator[](size_t index) { + assert(index < 2 && "Index out of bounds"); + return values[index]; +} + +const int64_t &Block::operator[](size_t index) const { + assert(index < 2 && "Index out of bounds"); + return values[index]; +} + +bool Block::operator==(Block &other) const { + return values[0] == other.values[0] && values[1] == other.values[1]; +} + +bool Block::operator==(const Block &other) const { + return values[0] == other.values[0] && values[1] == other.values[1]; +} + +void Block::print(llvm::raw_ostream &os) const { + os << "[" << values[0] << ", " << values[1] << "]"; +} + +llvm::ArrayRef Block::asArrayRef() const { + return llvm::ArrayRef(values, 2); +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Block blk) { + blk.print(os); + return os; +} + +// ===------------------ BlockRequests Implementation --------------------===// +// A class holding all blocking requests for a given mlir::Value. +// For convience, it also tracks the UsePoint of the value. +class BlockingRequests { +public: + BlockingRequests() = default; + BlockingRequests(int64_t h, int64_t w, mlir::Operation *user, int64_t pos) + : BlockingRequests(h, w, UsePoint(user, pos)) {} + + BlockingRequests(int64_t h, int64_t w, UsePoint point) + : BlockingRequests(Block(h, w), point) {} + + BlockingRequests(llvm::ArrayRef shape, UsePoint point) + : BlockingRequests(shape[0], shape[1], point) { + assert(shape.size() == 2 && "Invalid block size."); + } + + BlockingRequests(Block block, UsePoint point); + + bool operator==(const BlockingRequests &other) const; + bool operator!=(const BlockingRequests &other) const; + + Block getDefBlock() const; + Block getUseBlock(UsePoint point) const; + + void print(llvm::raw_ostream &os) const; + + static BlockingRequests meet(const BlockingRequests &lhs, + const BlockingRequests &rhs); + + static BlockingRequests join(const BlockingRequests &lhs, + const BlockingRequests &rhs); + + // indicate that one use of the result operand + // has decided on the inner block size. 
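+  // For example, once a store user of a load result has recorded its
+  // preferred block shape here, isInitialized() returns true and later
+  // visits can reconcile that request with requests from other users.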
+ bool isInitialized() const { return requests.size() != 0; } + + int64_t getNumUniqRequests() const { return getRequests().size(); } + + llvm::SmallVector getRequests() const { + llvm::SmallDenseSet reqs; + for (auto [point, block] : requests) + reqs.insert(block); + return llvm::SmallVector(reqs.begin(), reqs.end()); + } + + void updateDefBlock(Block block) { def = block; } + +private: + Block def; + llvm::DenseMap requests; +}; + +BlockingRequests::BlockingRequests(Block block, UsePoint point) { + assert(block && "Invalid block."); + requests.try_emplace(point, block); +} + +Block BlockingRequests::getDefBlock() const { + if (def) + return def; + if (requests.size()) + return (requests.begin()->second); + return Block(); +} + +Block BlockingRequests::getUseBlock(UsePoint point) const { + return requests.lookup(point); +} + +void BlockingRequests::print(llvm::raw_ostream &os) const { + if (!isInitialized()) { + os << "Uninitialized"; + } else { + os << "Requests (" << requests.size() << ", " + << "def: " << def << "): "; + for (auto [i, iter] : llvm::enumerate(requests)) { + os << "(" << *(iter.first).first << ", " << (iter.first).second + << "): \n\t" << iter.second; + if (i != requests.size() - 1) + os << ", "; + } + } +} + +bool BlockingRequests::operator==(const BlockingRequests &other) const { + return requests == other.requests; +} + +bool BlockingRequests::operator!=(const BlockingRequests &other) const { + return !(*this == other); +} + +BlockingRequests BlockingRequests::meet(const BlockingRequests &lhs, + const BlockingRequests &rhs) { + return join(lhs, rhs); +} + +BlockingRequests BlockingRequests::join(const BlockingRequests &lhs, + const BlockingRequests &rhs) { + BlockingRequests newReq; + if (lhs.isInitialized()) { + for (auto [point, block] : lhs.requests) { + newReq.requests.try_emplace(point, block); + } + } + if (rhs.isInitialized()) { + for (auto [point, block] : rhs.requests) { + newReq.requests.try_emplace(point, block); + } + } + return newReq; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + BlockingRequests requests) { + requests.print(os); + return os; +} + +// ===---------------- BlockingLattice Implementation -----------------===// +// A lattice wrapper for BlockingRequests +struct BlockingLattice : public mlir::dataflow::Lattice { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(BlockingLattice) + using Lattice::Lattice; + + mlir::ChangeResult join(const AbstractSparseLattice &rhs) override { + return join(static_cast(rhs).getValue()); + } + + mlir::ChangeResult join(const BlockingRequests &other) { + auto &val = getValue(); + BlockingRequests newValue = BlockingRequests::join(val, other); + if (newValue == val) + return mlir::ChangeResult::NoChange; + val = newValue; + return mlir::ChangeResult::Change; + } +}; + +// ===----------------------BlockingAnalysisImpl ---------------------===// +class BlockingAnalysisImpl + : public mlir::dataflow::SparseBackwardDataFlowAnalysis { +public: + BlockingAnalysisImpl(mlir::DataFlowSolver &solver, + mlir::SymbolTableCollection &symbolTable, + std::shared_ptr uArch) + : SparseBackwardDataFlowAnalysis(solver, symbolTable), uArch(uArch) {} + + void visitOperation(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results) override; + + void visitBranchOperand(mlir::OpOperand &operand) override {} + + void visitCallOperand(mlir::OpOperand &operand) override {} + + void setToExitState(BlockingLattice *lattice) override {} + +private: + void visitPrefetchTileOp(xetile::PrefetchTileOp op, + 
mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitLoadTileOp(xetile::LoadTileOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitStoreTileOp(xetile::StoreTileOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitUpdateTileOp(xetile::UpdateTileOffsetOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitTileMMAOp(xetile::TileMMAOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitVectorizableOp(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitShapecastOp(mlir::vector::ShapeCastOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitReductionOp(xetile::ReductionOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitBroadcastOp(xetile::BroadcastOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitTransposeOp(xetile::TransposeOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + int getMaxSLMBlockSize(int elemBitWidth, int height); + + template + Block getInnerBlockSize(mlir::Operation *op, mlir::Type elemTy, + llvm::ArrayRef &shape, + int memorySpace = 0); + + llvm::SmallVector + getMMASize(mlir::Type elemTy, const int APrecision, const int BPrecision, + const int CPrecision, const int DPrecision); + +private: + std::shared_ptr uArch = nullptr; +}; + +void BlockingAnalysisImpl::visitOperation( + mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + if (auto updateTileOp = mlir::dyn_cast(op)) + visitUpdateTileOp(updateTileOp, operands, results); + + if (auto prefetchOp = mlir::dyn_cast(op)) + visitPrefetchTileOp(prefetchOp, operands, results); + + if (auto loadOp = mlir::dyn_cast(op)) + visitLoadTileOp(loadOp, operands, results); + + if (auto storeOp = mlir::dyn_cast(op)) + visitStoreTileOp(storeOp, operands, results); + + if (auto tileMMAOp = mlir::dyn_cast(op)) + visitTileMMAOp(tileMMAOp, operands, results); + + if (auto reductionOp = mlir::dyn_cast(op)) + visitReductionOp(reductionOp, operands, results); + + if (auto transposeOp = mlir::dyn_cast(op)) + visitTransposeOp(transposeOp, operands, results); + + if (auto broadcastOp = mlir::dyn_cast(op)) + visitBroadcastOp(broadcastOp, operands, results); + + if (op->hasTrait()) + visitVectorizableOp(op, operands, results); + + if (auto shapecastOp = mlir::dyn_cast(op)) + visitShapecastOp(shapecastOp, operands, results); +} + +void BlockingAnalysisImpl::visitPrefetchTileOp( + xetile::PrefetchTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto tileTy = op.getTile().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + // initialized with a default size queried from the architecture + auto size = getInnerBlockSize(op, elemTy, shape, memSpace); + if (!size) + return; // do nothing if didnot get a valid block size + auto BlockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); +} + +void BlockingAnalysisImpl::visitLoadTileOp( + xetile::LoadTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto lattice = results[0]->getValue(); + + if (lattice.getNumUniqRequests() > 1) + op.emitWarning("multiple users requesting different blocking sizes."); + + auto tileTy = op.getSource().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + // initialized with a 
default size queried from the architecture + Block block = getInnerBlockSize(op, elemTy, shape, memSpace); + + // It has users but users' requirements are not available yet. + // Worth to wait until all users are visited. + if (!op.getValue().use_empty() && !lattice.isInitialized()) + return; + + // adjust according to user's requirements if it is available + if (lattice.isInitialized()) { + // align the height dimension if user is a transpose op, + // otherwise align the width dimension to minimize the + // in-register data movements. + bool hasTransposeUser = op.getValue().hasOneUse() && + mlir::isa(*(op->user_begin())); + + int dim = hasTransposeUser ? 0 : 1; + for (auto rq : lattice.getRequests()) + block[dim] = std::min(block[dim], rq[dim]); + } + + if (!block) + return; // do nothing if didnot get a valid block size + + auto BlockingRequest = BlockingRequests(block, UsePoint({op, 0})); + // propagate the blocking size to its def op + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getValue())->getValue(); + def.updateDefBlock(block); +} + +void BlockingAnalysisImpl::visitStoreTileOp( + xetile::StoreTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto tileTy = op.getTile().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + auto size = getInnerBlockSize(op, elemTy, shape, memSpace); + + if (!size) + return; // do nothing if didnot get a valid block size + + for (auto &&[i, inputOpr] : llvm::enumerate(operands)) { + auto blockingRequest = BlockingRequests(size, UsePoint(op, i)); + propagateIfChanged(inputOpr, inputOpr->join(blockingRequest)); + } +} + +void BlockingAnalysisImpl::visitUpdateTileOp( + xetile::UpdateTileOffsetOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto lattice = results[0]->getValue(); + if (lattice.isInitialized()) { + auto block = lattice.getRequests()[0]; + auto request = BlockingRequests(block, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(request)); + } +} + +void BlockingAnalysisImpl::visitTileMMAOp( + xetile::TileMMAOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + auto getElemBitWidth = [](mlir::VectorType vecTy) { + return vecTy.getElementType().getIntOrFloatBitWidth(); + }; + + auto C = op.getC(); + auto aPrecision = getElemBitWidth(op.getAType()); + auto bPrecision = getElemBitWidth(op.getBType()); + auto dPrecision = getElemBitWidth(op.getOutputType()); + auto cPrecision = !C ? 
dPrecision : getElemBitWidth(C.getType()); + + auto mmaSize = getMMASize(op.getElementType(), aPrecision, bPrecision, + cPrecision, dPrecision); + + auto blockSizeForA = + BlockingRequests(mmaSize[0], mmaSize[1], UsePoint({op, 0})); + auto blockSizeForB = + BlockingRequests(mmaSize[1], mmaSize[2], UsePoint({op, 1})); + + propagateIfChanged(operands[0], operands[0]->join(blockSizeForA)); + propagateIfChanged(operands[1], operands[1]->join(blockSizeForB)); + if (C) { + auto blockSizeForC = + BlockingRequests(mmaSize[0], mmaSize[2], UsePoint(op, 2)); + propagateIfChanged(operands[2], operands[2]->join(blockSizeForC)); + } + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getOutput())->getValue(); + def.updateDefBlock(Block(mmaSize[0], mmaSize[2])); +} + +void BlockingAnalysisImpl::visitReductionOp( + xetile::ReductionOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto srcTy = op.getSource().getType(); + auto dims = op.getReductionDims(); + // We only support reduction on 2D types now. + if (srcTy.getRank() != 2 || dims.size() != 1) + return; + + auto elemTy = srcTy.getElementType(); + auto shape = srcTy.getShape(); + // ReductionOp is special. Its blocking size is fixed to {1, + // min(subgroupSize, width)} + auto size = getInnerBlockSize(op, elemTy, shape); + if (!size) + return; // do nothing if didnot get a valid block size + + auto blockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(blockingRequest)); +} + +void BlockingAnalysisImpl::visitBroadcastOp( + xetile::BroadcastOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto srcTy = op.getSource().getType(); + auto dims = op.getBroadcastDim(); + // We only support reduction on 2D types now. + if (srcTy.getRank() != 2 || dims.size() != 1) + return; + + auto elemTy = srcTy.getElementType(); + auto shape = srcTy.getShape(); + // BroadcastOp is special. Its blocking size is fixed to {1, + // min(subgroupSize, width)} + auto size = getInnerBlockSize(op, elemTy, shape); + if (!size) + return; // do nothing if didnot get a valid block size + + auto blockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(blockingRequest)); +} + +void BlockingAnalysisImpl::visitTransposeOp( + xetile::TransposeOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + auto permutation = op.getPermutation(); + auto resType = op.getResult().getType(); + // we only support true 2D transpose now + if (resType.getRank() != 2 || permutation != mlir::ArrayRef({1, 0})) + return; + + auto lattice = results[0]->getValue(); + + // Wait for requests from users. + if (!op->use_empty() && !lattice.isInitialized()) + return; + + Block block; + + // use the default size if no users + if (op->use_empty()) { + auto srcTy = op.getVector().getType(); + auto shape = srcTy.getShape(); + block = getInnerBlockSize(op, srcTy.getElementType(), shape); + } + + // TransposeOp determines its blocking size based on requests from + // its users, by swapping the blocking size of its users. 
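+  // For example (hypothetical request): if the single user asks for an
+  // [8, 16] block of the transpose result, the operand is asked for [16, 8].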
+ if (lattice.isInitialized()) { + // TODO: handle multiple users + if (lattice.getNumUniqRequests() == 1) { + auto req = lattice.getRequests()[0]; + block = Block(req[1], req[0]); + } + } + + if (!block) + return; // do nothing if didnot get a valid block size + + auto request = BlockingRequests(block, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(request)); + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getResult())->getValue(); + def.updateDefBlock(Block(block[1], block[0])); +} + +void BlockingAnalysisImpl::visitVectorizableOp( + mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + // Currently only supports simple elementwise math ops. + if (op->getNumResults() != 1) + return; + + auto type = mlir::dyn_cast(op->getResult(0).getType()); + if (!type) + return; + + auto lattice = results[0]->getValue(); + + // Wait for requests from users. + if (!op->use_empty() && !lattice.isInitialized()) + return; + + auto elemTy = type.getElementType(); + auto shape = type.getShape(); + Block block = getInnerBlockSize(op, elemTy, shape); + + // elementwise operations are not sensitive to the block size. + // It will use the block size requested by its users. + if (lattice.isInitialized()) { + block[0] = 0; + for (auto &req : lattice.getRequests()) { + block[0] = std::max(block[0], req[0]); + block[1] = std::min(block[1], req[1]); + } + } + + // do nothing if get an invalid block + if (!block) + return; + + // propagate the block size on its operands + for (auto &&[i, inputOpr] : llvm::enumerate(operands)) { + auto req = BlockingRequests(block, UsePoint(op, i)); + propagateIfChanged(inputOpr, inputOpr->join(req)); + } + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op->getResult(0))->getValue(); + def.updateDefBlock(block); +} + +void BlockingAnalysisImpl::visitShapecastOp( + mlir::vector::ShapeCastOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto shape = op.getSource().getType().getShape(); + if (shape.size() == 2) { + auto BlockingRequest = BlockingRequests(shape, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); + } +} + +int BlockingAnalysisImpl::getMaxSLMBlockSize(int elemBitWidth, int height) { + // TODO: use uArch to get max vec size? + const int lscConstraint = 512; // lsc supports upto 512 bytes per load/store + int numElems = (lscConstraint * 8) / elemBitWidth; + int width = numElems / height; + return width; +} + +// Determine the inner block size for the given operation based on the +// operand's element data type, shape, and also memory space. +template +Block BlockingAnalysisImpl::getInnerBlockSize( + mlir::Operation *op, mlir::Type elemTy, llvm::ArrayRef &shape, + int memorySpace) { + assert(elemTy.isIntOrFloat() && "only support int or float element type."); + + // TODO: get from uArch ? + const int64_t subgroupSize = 16; + int elemSize = elemTy.getIntOrFloatBitWidth(); + + int maxHeight = 0, minHeight = 0, maxWidth = 0, minWidth = 0; + if (mlir::isa(op) || + mlir::isa(op)) { + // for reduction and broadcast ops, we simply using + // [1, subgroupSize] as innerblock size + maxWidth = subgroupSize; + minWidth = 1; + maxHeight = 1; + minHeight = 1; + } else if (op->hasTrait()) { + // for elementwise operations, they are pretty flexiable + // on the block size. But we expect its second dimension + // is subgroupSize aligned. 
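+    // For example (hypothetical shape): a 64x64 elementwise op with
+    // subgroupSize = 16 gets minHeight = minWidth = 1, maxHeight = 64 and
+    // maxWidth = 16, so the default block computed below is [64, 16].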
+ minWidth = 1; + minHeight = 1; + maxWidth = std::min(shape[1], subgroupSize); + maxHeight = shape[0]; + } else if (mlir::isa(op)) { + // for transpose op, we will use the original shape + // as the default size, and adjust it if it is defined + // by a load op + minWidth = 1; + minHeight = 1; + maxWidth = shape[1]; + maxHeight = shape[0]; + + // if the transpose follows a load op, and data element is 32-bit + // or 64-bit, it is expected to be folded with a load, and need to + // be aligned to hardware constraints. + auto defOp = op->getOperand(0).getDefiningOp(); + if (defOp && elemSize >= 32) { + auto params = uArch->get2DLoadConfig(defOp, elemSize, false, true); + minHeight = params->blockHeight.min; + minWidth = params->blockWidth.min; + // to be compatible with the SIMT instrinsic, the maximum height is + // limited to 16, which is maximum supported value by SIMT instrinsic. + maxHeight = std::min(params->blockHeight.max, 16); + maxWidth = params->blockWidth.max; + } + } else if (memorySpace == 3) { + // this is supposed for load/store from/to SLM, they will use regular + // load/store instructions with chunk size. lsc instrinsic and hardware + // has serveral limits on the size per load/store. + minHeight = minWidth = 1; + // If shape[0] is divisible by subgroup size, we use regular load (with + // chunk size) with XeGPU.load_gather (maxHeight = 16). Otherwise, we + // use 1D load with XeGPU.load_nd(1d, maxHeight = 1). + maxHeight = shape[0] % subgroupSize == 0 ? subgroupSize : 1; + maxWidth = getMaxSLMBlockSize(elemSize, maxHeight); + } else { // for load/store from/to global memory + mlir::FailureOr params; + if (mlir::isa(op)) + params = uArch->get2DStoreConfig(elemSize); + if (mlir::isa(op) || + mlir::isa(op)) { + bool transpose = false; + // if its user is a transpose op, and data element is 32-bit + // or 64-bit, we will use the transpose supported size. + if (auto loadOp = mlir::dyn_cast(op)) { + auto value = loadOp.getValue(); + transpose = elemSize >= 32 && value.hasOneUse() && + mlir::isa(*(value.user_begin())); + } + params = uArch->get2DLoadConfig(op, elemSize, false, transpose); + } + if (mlir::succeeded(params)) { + maxHeight = params->blockHeight.max; + minHeight = params->blockHeight.min; + maxWidth = params->blockWidth.max; + minWidth = params->blockWidth.min; + } + } + + auto findLargestDivisorInRange = [&](int64_t v, int64_t l, int64_t h) { + for (int i = h; i >= l; i--) { + if (v % i == 0) + return i; + } + // irregular shape or shape is not in the supported range. + return 0; + }; + + auto height = findLargestDivisorInRange(shape[0], minHeight, maxHeight); + auto width = findLargestDivisorInRange(shape[1], minWidth, maxWidth); + return Block(height, width); +} + +llvm::SmallVector +BlockingAnalysisImpl::getMMASize(mlir::Type elemTy, const int APrecision, + const int BPrecision, const int CPrecision, + const int DPrecision) { + assert(elemTy.isIntOrFloat() && "only support int or float data type."); + auto dpasParams = + uArch->getDPASConfig(APrecision, BPrecision, CPrecision, DPrecision); + return llvm::SmallVector( + {dpasParams.m, dpasParams.k, dpasParams.n}); +} + +// ===--------------------------------BlockingAnalysis---------------------------------===// + +mlir::LogicalResult BlockingAnalysis::run(mlir::Operation *op) { + mlir::SymbolTableCollection symbolTable; + // BlockingAnalysisImpl is using default initialize method + // provided by SparseBackwardDataFlowAnalysis. 
And this default + // initialize method relies on results of DeadCodeAnalysis to + // skip analysis on the dead code. + solver.load(); + solver.load(); + solver.load(symbolTable, uArch); + target = op; + return solver.initializeAndRun(op); +} + +void BlockingAnalysis::printAnalysisResult() { + llvm::dbgs() << "\n\nBlockingAnalysis Results:\n"; + target->walk([&](mlir::Operation *op) { + if (op->getNumRegions() == 0 && op->getNumResults() == 1) { + auto resTy = op->getResult(0).getType(); + if (mlir::isa(resTy) || + mlir::isa(resTy)) { + llvm::dbgs() << "\nOp: " << *op; + for (auto [i, inputOpr] : llvm::enumerate(op->getOperands())) { + if (mlir::isa(inputOpr.getType()) || + mlir::isa(inputOpr.getType())) { + UsePoint p(op, i); + llvm::dbgs() << "\n opr[" << i << "]: " << inputOpr + << " --> blkSZ: " << getUseBlockSize(inputOpr, p); + } + } + + for (auto [i, res] : llvm::enumerate(op->getResults())) + llvm::dbgs() << "\n res[" << i << "]: " << res + << " --> blkSZ: " << getDefBlockSize(res); + llvm::dbgs() << "\n"; + } + } else if (auto forOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << op->getName(); + for (auto [i, arg] : llvm::enumerate(forOp.getRegionIterArgs())) + llvm::dbgs() << "\n arg[" << i << "]: " + << " --> blkSZ: " << getDefBlockSize(arg); + + for (auto [i, res] : llvm::enumerate(forOp.getResults())) + llvm::dbgs() << "\n res[" << i << "]: " + << " --> blkSZ: " << getDefBlockSize(res); + llvm::dbgs() << "\n"; + } else if (auto YieldOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << op->getName(); + for (auto [i, res] : llvm::enumerate(YieldOp.getResults())) + llvm::dbgs() << "\n res[" << i << "]: " << res + << " --> blkSZ: " << getDefBlockSize(res) << ", " + << getUseBlockSize(res, UsePoint(op, i)); + llvm::dbgs() << "\n"; + } else if (auto StoreOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << *op; + for (auto [i, inputOpr] : llvm::enumerate(op->getOperands())) { + llvm::dbgs() << "\n opr[" << i << "]: " << inputOpr << " --> blkSZ: " + << getUseBlockSize(inputOpr, UsePoint(StoreOp, i)); + } + llvm::dbgs() << "\n"; + } + }); +} + +Block BlockingAnalysis::getUseBlockSize(mlir::Value val, UsePoint point) const { + auto *state = solver.lookupState(val); + if (!state) + return Block(); + return state->getValue().getUseBlock(point); +} + +Block BlockingAnalysis::getDefBlockSize(mlir::Value val) const { + auto *state = solver.lookupState(val); + if (!state) + return Block(); + return state->getValue().getDefBlock(); +} + +} // namespace imex diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h new file mode 100644 index 000000000..96f2249e2 --- /dev/null +++ b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h @@ -0,0 +1,68 @@ + +#ifndef IMEX_BLOCKING_ANALYSIS_H +#define IMEX_BLOCKING_ANALYSIS_H + +#include +#include +#include + +#include + +#include "imex/Utils/XeArch.h" + +namespace imex { + +/// a class representing a inner block size, provides some +/// convinient methods for manipulation. 
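+/// A minimal usage sketch (illustrative values, based on the interface below):
+///   Block blk(8, 16);   // an 8x16 inner block: blk[0] == 8, blk[1] == 16
+///   if (blk) { ... }    // true, since both dims are non-zero
+///   Block none;         // default-constructed [0, 0]; evaluates to false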
+class Block {
+public:
+  Block() : values{0, 0} {}
+
+  Block(int64_t h, int64_t w) : values{h, w} {}
+
+  int64_t &operator[](size_t index);
+  const int64_t &operator[](size_t index) const;
+
+  bool operator==(Block &other) const;
+  bool operator==(const Block &other) const;
+
+  bool operator!=(Block &other) const { return !(*this == other); }
+  bool operator!=(const Block &other) const { return !(*this == other); }
+
+  void print(llvm::raw_ostream &os) const;
+
+  llvm::ArrayRef asArrayRef() const;
+
+  operator bool() const { return values[0] != 0 && values[1] != 0; }
+
+private:
+  int64_t values[2];
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Block blk);
+
+// A pair of an operation and an operand index, representing
+// the use point of a value.
+typedef std::pair UsePoint;
+
+class BlockingAnalysis {
+public:
+  explicit BlockingAnalysis(std::shared_ptr uArch) {
+    this->uArch = uArch;
+  };
+
+  mlir::LogicalResult run(mlir::Operation *op);
+
+  Block getUseBlockSize(mlir::Value val, UsePoint point) const;
+  Block getDefBlockSize(mlir::Value val) const;
+  void printAnalysisResult();
+
+private:
+  mlir::DataFlowSolver solver;
+  std::shared_ptr uArch;
+  mlir::Operation *target;
+};
+
+} // namespace imex
+
+#endif // IMEX_BLOCKING_ANALYSIS_H
diff --git a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp
new file mode 100644
index 000000000..74604dd52
--- /dev/null
+++ b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp
@@ -0,0 +1,875 @@
+//===----------- BlockingRewrite.cpp --------- Blocking Pass ---*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the lowering transformation for determining the problem
+/// size that can be handled by an XeGPU operator (hardware instruction). An
+/// XeTile program can work on a bigger problem size that cannot be handled by
+/// a single hardware instruction, but it then needs to be decomposed into
+/// smaller pieces such that each piece can be handled by a hardware
+/// instruction.
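The file-header comment above summarizes the core idea: an XeTile-sized problem is decomposed into inner blocks that individual hardware instructions can handle, with the block size chosen as the largest divisor of each dimension inside the hardware-supported range (see findLargestDivisorInRange in BlockingAnalysis.cpp above). The standalone C++ sketch below only illustrates that selection; it is not part of the patch, and the 1..32 / 1..16 limits are assumed example values, not real uArch parameters.

#include <cstdint>
#include <cstdio>

// Largest divisor of `v` within [lo, hi]; returns 0 if none exists
// (irregular shape, or shape outside the supported range).
static int64_t largestDivisorInRange(int64_t v, int64_t lo, int64_t hi) {
  for (int64_t i = hi; i >= lo; --i)
    if (v % i == 0)
      return i;
  return 0;
}

int main() {
  // A hypothetical 64x64 tile with assumed hardware ranges of height in
  // [1, 32] and width in [1, 16] gets a 32x16 inner block, i.e. a 2x4 grid
  // of hardware-sized pieces.
  int64_t shape[2] = {64, 64};
  int64_t h = largestDivisorInRange(shape[0], 1, 32);
  int64_t w = largestDivisorInRange(shape[1], 1, 16);
  std::printf("block = %lldx%lld, grid = %lldx%lld\n", (long long)h,
              (long long)w, (long long)(shape[0] / h),
              (long long)(shape[1] / w));
  return 0;
}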
+/// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "imex/Dialect/XeTile/Transforms/Passes.h" +#include "imex/Utils/DebugUtils.h" +#include "imex/Utils/XeArch.h" + +#include "BlockingAnalysis.h" +#include "PassDetail.h" + +using namespace mlir; +using namespace llvm; +using namespace imex; +namespace imex { +#define GEN_PASS_DECL_NEWXETILEBLOCKING +#define GEN_PASS_DEF_NEWXETILEBLOCKING +#include "imex/Dialect/XeTile/Transforms/Passes.h.inc" +} // namespace imex + +namespace imex { +namespace Blocking { + +static xetile::TileUnpackOp +addUnpackOp(mlir::Value src, mlir::ConversionPatternRewriter &rewriter) { + auto srcTy = llvm::dyn_cast_if_present(src.getType()); + assert(srcTy && srcTy.getRank() == 4); + auto shape = srcTy.getShape(); + auto grids = shape.take_front(2); + auto innerBlocks = shape.take_back(2); + llvm::SmallVector unpackShape( + {grids[0] * innerBlocks[0], grids[1] * innerBlocks[1]}); + + auto unpackTy = mlir::VectorType::get(unpackShape, srcTy.getElementType()); + return rewriter.create( + src.getLoc(), unpackTy, src, + mlir::DenseI64ArrayAttr::get(src.getContext(), innerBlocks)); +} + +static mlir::Value addPackOp(mlir::Value src, + llvm::ArrayRef targetBlkSizes, + mlir::ConversionPatternRewriter &rewriter) { + auto srcTy = mlir::dyn_cast(src.getType()); + assert(srcTy && targetBlkSizes.size() == 2); + auto shape = srcTy.getShape(); + llvm::SmallVector packShape({shape[0] / targetBlkSizes[0], + shape[1] / targetBlkSizes[1], + targetBlkSizes[0], targetBlkSizes[1]}); + + auto packTy = mlir::VectorType::get(packShape, srcTy.getElementType()); + auto packOp = rewriter.create( + src.getLoc(), packTy, src, + mlir::DenseI64ArrayAttr::get(src.getContext(), targetBlkSizes)); + return packOp; +} + +/// OpConversionPatternWithAnalysis is a wrapper around OpConversionPattern +/// but takes an extra AnalysisT object as an argument, such that patterns +/// can leverage the analysis results. +template +class OpConversionPatternWithAnalysis + : public mlir::OpConversionPattern { +public: + using OpPatternRewriter = typename mlir::ConversionPatternRewriter; + + OpConversionPatternWithAnalysis(mlir::MLIRContext *context, + AnalysisT &analysis) + : mlir::OpConversionPattern(context), analysis(analysis) {} + +protected: + AnalysisT &analysis; +}; + +/// OpTraitConversionPatternWithAnalysis is a wrapper around +/// OpTraitConversionPattern but takes an extra AnalysisT object as an argument, +/// such that patterns can leverage the analysis results. +template
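As a side note on addPackOp/addUnpackOp above: they only reshape between a 2D vector [H, W] and a 4D blocked vector [H/h, W/w, h, w] for an inner block [h, w]. The sketch below mirrors that shape bookkeeping in plain C++ with example values; it is illustrative only and does not touch any MLIR APIs.

#include <array>
#include <cassert>
#include <cstdint>

// Pack: 2D shape [H, W] + inner block [h, w] -> 4D blocked shape
// [H/h, W/w, h, w] (grid dims first, inner block dims last).
static std::array<int64_t, 4> packShape(std::array<int64_t, 2> shape,
                                        std::array<int64_t, 2> blk) {
  assert(shape[0] % blk[0] == 0 && shape[1] % blk[1] == 0);
  return {shape[0] / blk[0], shape[1] / blk[1], blk[0], blk[1]};
}

// Unpack: 4D blocked shape -> original 2D shape (grid dims * inner block dims).
static std::array<int64_t, 2> unpackShape(std::array<int64_t, 4> s) {
  return {s[0] * s[2], s[1] * s[3]};
}

int main() {
  auto packed = packShape({64, 64}, {32, 16}); // -> {2, 4, 32, 16}
  auto restored = unpackShape(packed);         // -> {64, 64}
  assert(restored[0] == 64 && restored[1] == 64);
  (void)restored;
  return 0;
}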