From 1fd2e0002630fc16e73cc55039a7ea175de650e6 Mon Sep 17 00:00:00 2001 From: Igor Zamyatin Date: Tue, 8 Oct 2024 14:30:25 -0500 Subject: [PATCH] llvm pulldown 10/2024 (#915) --- build_tools/llvm_version.txt | 2 +- ...upport-for-VectorAnyINTEL-capability.patch | 99 ++-- ...e-spirv.CL.printf-op-assembly-format.patch | 49 -- ...onstant-attribute-in-ParseDecoration.patch | 36 -- ...n-and-de-serialization-support-for-s.patch | 33 +- ...0007-Move-chunk_size-into-TensorDesc.patch | 432 ------------------ ...mporary-downstream-defintion-changes.patch | 38 +- ...ative-bf16-support-in-SPIR-V-dialect.patch | 52 +-- .../0010-refine-the-XeGPU-definition.patch | 206 --------- docs/rfcs/XeGPU.md | 30 +- include/imex/Dialect/XeTile/IR/XeTileAttrs.td | 14 +- include/imex/Dialect/XeTile/IR/XeTileTypes.td | 14 +- lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 113 ----- lib/Conversion/XeGPUToVC/LSCPatterns.cpp | 55 +-- lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 46 +- .../XeTileToXeGPU/ArithOpConversion.cpp | 6 +- .../XeTileToXeGPU/XeTileOpConversion.cpp | 8 +- lib/Dialect/XeTile/IR/XeTileDialect.cpp | 2 +- lib/Dialect/XeTile/IR/XeTileOps.cpp | 2 +- .../XeTile/Transforms/BlockAligning.cpp | 2 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 6 +- .../XeTile/Transforms/BlockingAnalysis.cpp | 17 +- .../XeTile/Transforms/BlockingRewrite.cpp | 2 +- .../XeTile/Transforms/Canonicalization.cpp | 8 +- lib/Transforms/OptimizeTranspose.cpp | 4 +- lib/Transforms/PropagatePackedLayout.cpp | 12 +- lib/Transforms/VnniTransformation.cpp | 16 +- lib/Utils/XeArch.cpp | 4 +- test/Conversion/GPUToSPIRV/printf.mlir | 2 +- test/Conversion/XeGPUToVC/atomiclsc.mlir | 44 +- .../XeGPUToVC/load_global_no_chunk_f16.mlir | 37 +- .../XeGPUToVC/load_global_no_chunk_f32.mlir | 29 +- .../prefetch_global_no_chunk_f16.mlir | 19 +- .../prefetch_global_no_chunk_f32.mlir | 18 +- .../store_load_slm_no_chunk_f16.mlir | 50 +- .../store_load_slm_no_chunk_f32.mlir | 36 +- .../XeTileToXeGPU/array_length_load.mlir | 4 +- test/Conversion/XeTileToXeGPU/lit.local.cfg | 8 + test/Conversion/XeTileToXeGPU/reduction.mlir | 20 +- .../sg_gemm_1k_1k_1k_f16_f32.mlir | 222 ++++----- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 80 ++-- .../sg_gemm_1k_1k_1k_i8_i32.mlir | 38 +- .../sg_gemm_1k_1k_1k_tf32_tf32.mlir | 80 ++-- .../XeTileToXeGPU/sg_gemm_transpose_b.mlir | 10 +- .../XeTileToXeGPU/sg_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_mixed_scf.mlir | 50 +- test/Conversion/XeTileToXeGPU/sg_scf_for.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_store_tile.mlir | 32 +- .../Conversion/XeTileToXeGPU/sg_tile_mma.mlir | 12 +- .../XeTileToXeGPU/sg_tiled_broadcast.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_scf_for.mlir | 26 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_tiled_store_tile.mlir | 32 +- .../XeTileToXeGPU/sg_tiled_tile_mma.mlir | 12 +- test/Conversion/XeTileToXeGPU/test_order.mlir | 8 +- test/Dialect/XeGPU/IR/XeGPUOps.mlir | 8 +- test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 12 +- test/Dialect/XeGPU/IR/create_nd_tdesc.mlir | 16 +- test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 19 +- test/Dialect/XeGPU/IR/create_tdesc_vc.mlir | 50 +- test/Dialect/XeGPU/IR/invalid_vc.mlir | 26 +- test/Dialect/XeGPU/IR/load_gather_vc.mlir | 24 +- test/Dialect/XeGPU/IR/store_scatter_vc.mlir | 13 +- test/Dialect/XeGPU/IR/update_offset_vc.mlir | 14 +- test/Dialect/XeTile/IR/ops.mlir | 4 +- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 6 +- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 6 +- 
.../Transforms/wg_to_sg_btranspose.mlir | 28 +- .../Transforms/wg_to_sg_gemm_postop.mlir | 6 +- .../load_global_chunk_4_f32.mlir | 8 +- .../load_global_chunk_8_f32.mlir | 8 +- .../load_global_no_chunk_f16.mlir | 7 +- .../load_global_no_chunk_f32.mlir | 6 +- .../store_global_chunk_4_f32.mlir | 5 +- .../store_global_chunk_8_f32.mlir | 5 +- .../store_global_no_chunk_f16.mlir | 5 +- .../store_global_no_chunk_f32.mlir | 5 +- .../store_load_slm_chunk_4_f32.mlir | 9 +- .../store_load_slm_chunk_8_f32.mlir | 9 +- .../store_load_slm_chunk_8_f32_mask.mlir | 8 +- .../store_load_slm_no_chunk_f16.mlir | 9 +- .../store_load_slm_no_chunk_f32.mlir | 9 +- .../Dialect/XeGPU/load1d-slm-f32.mlir | 2 +- .../XeGPU/loadgather2d_masked_f32.mlir | 11 +- .../XeGPU/loadgather_chunk_size_f32.mlir | 5 +- .../XeGPU/loadgather_chunk_size_i32.mlir | 5 +- .../Dialect/XeGPU/loadgather_f32.mlir | 5 +- .../Dialect/XeGPU/loadgather_masked_f32.mlir | 5 +- .../Dialect/XeGPU/optimize_transpose.mlir | 46 +- test/SPIRV/OpTest.spirv.CL.printf.mlir | 2 +- .../postop_reduce_n.mlir | 76 +-- .../VectorLinearize/postop_reduce_n.mlir | 76 +-- .../VnniTransform/gemm_with_extract.mlir | 58 +-- .../VnniTransform/gemm_with_extract_e2e.mlir | 56 +-- 96 files changed, 1034 insertions(+), 1885 deletions(-) delete mode 100644 build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch delete mode 100644 build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch delete mode 100644 build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch delete mode 100644 build_tools/patches/0010-refine-the-XeGPU-definition.patch create mode 100644 test/Conversion/XeTileToXeGPU/lit.local.cfg diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 0ca8e4c0b..33000613b 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -08a61eb01172054fc5f8c78ff527f01d9768569b +add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a diff --git a/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch b/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch index b04bc1020..531b66a3e 100644 --- a/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch +++ b/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch @@ -1,35 +1,14 @@ -From 94cc2bb6a778cad3b762244d6d78ecf2e19b5372 Mon Sep 17 00:00:00 2001 -From: Md Abdullah Shahneous Bari -Date: Fri, 26 Apr 2024 20:20:28 +0000 -Subject: [PATCH 1/7] Add-support-for-VectorAnyINTEL-capability - -Allow vector of any lengths between [2-2^63-1]. -VectorAnyINTEL capability (part of "SPV_INTEL_vector_compute" extension) -relaxes the length constraint on SPIR-V vector sizes from 2,3, and 4. - -Also add support for following: - -- Add support for capability inferred extension requirement checking. -If a capability is a requirement, the respective extension that implements -it should also become an extension requirement, there were no support for -that check, as a result, the extension requirement had to be added separately. -This separate requirement addition causes problem when a feature is enabled by -multiple capability, and one of the capability is part of an extension. E.g., -vector size of 16 can be enabled by both "Vector16" and "vectorAnyINTEL" -capability, however, only "vectorAnyINTEL" has an extension requirement -("SPV_INTEL_vector_compute"). Since the process of adding capability -and extension requirement are independent, there is no way, to handle -cases like this. 
Therefore, for cases like this, enable adding capability -requirement initially, then do the check for capability inferred extension. - -- Add support for optionally skipping capability and extension requirement +From 45b150c9a0c4e4bd60c153e5142da17fd6cde6da Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 17:42:02 -0500 +Subject: [PATCH] Add support for VectorAnyINTEL capability --- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 9 +- mlir/include/mlir/IR/CommonTypeConstraints.td | 86 ++++++++++++ mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp | 7 +- mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp | 24 +++- - .../SPIRV/Transforms/SPIRVConversion.cpp | 132 +++++++++++++++--- + .../SPIRV/Transforms/SPIRVConversion.cpp | 126 +++++++++++++++--- .../arith-to-spirv-unsupported.mlir | 4 +- .../ArithToSPIRV/arith-to-spirv.mlir | 34 +++++ .../FuncToSPIRV/types-to-spirv.mlir | 17 ++- @@ -42,13 +21,13 @@ requirement initially, then do the check for capability inferred extension. mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 34 ++--- mlir/test/Target/SPIRV/arithmetic-ops.mlir | 6 +- mlir/test/Target/SPIRV/ocl-ops.mlir | 6 + - 17 files changed, 319 insertions(+), 68 deletions(-) + 17 files changed, 316 insertions(+), 65 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -index 6ec97e17c5dc..75e42c024553 100644 +index 3b7da9b44a08..ddaeb13ef253 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -@@ -4138,7 +4138,12 @@ def SPIRV_Int32 : TypeAlias; +@@ -4142,7 +4142,12 @@ def SPIRV_Int32 : TypeAlias; def SPIRV_Float32 : TypeAlias; def SPIRV_Float : FloatOfWidths<[16, 32, 64]>; def SPIRV_Float16or32 : FloatOfWidths<[16, 32]>; @@ -62,8 +41,8 @@ index 6ec97e17c5dc..75e42c024553 100644 [SPIRV_Bool, SPIRV_Integer, SPIRV_Float]>; // Component type check is done in the type parser for the following SPIR-V // dialect-specific types so we use "Any" here. -@@ -4189,7 +4194,7 @@ class SPIRV_JointMatrixOfType allowedTypes> : - "Joint Matrix">; +@@ -4185,7 +4190,7 @@ class SPIRV_CoopMatrixOfType allowedTypes> : + "Cooperative Matrix">; class SPIRV_VectorOf : - VectorOfLengthAndType<[2, 3, 4, 8,16], [type]>; @@ -72,10 +51,10 @@ index 6ec97e17c5dc..75e42c024553 100644 class SPIRV_ScalarOrVectorOf : AnyTypeOf<[type, SPIRV_VectorOf]>; diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td -index af4f13dc0936..28d49d9e91f0 100644 +index 211385245555..671ec270efe0 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td -@@ -608,6 +608,92 @@ class ScalableVectorOfRankAndLengthAndType allowedRanks, +@@ -637,6 +637,92 @@ class ScalableVectorOfRankAndLengthAndType allowedRanks, ScalableVectorOfLength.summary, "::mlir::VectorType">; @@ -169,7 +148,7 @@ index af4f13dc0936..28d49d9e91f0 100644 // Negative values for `n` index in reverse. 
class ShapedTypeWithNthDimOfSize allowedSizes> : Type< diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp -index 72488d6e5d0b..b38f20458d32 100644 +index 48be287ef833..aec6d64209dd 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -187,9 +187,12 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, @@ -188,7 +167,7 @@ index 72488d6e5d0b..b38f20458d32 100644 return Type(); } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -index 3f25696aa5eb..2d64fea0dc26 100644 +index 337df3a5a65f..542c6beba2e4 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -100,9 +100,11 @@ bool CompositeType::classof(Type type) { @@ -206,7 +185,7 @@ index 3f25696aa5eb..2d64fea0dc26 100644 } Type CompositeType::getElementType(unsigned index) const { -@@ -170,7 +172,21 @@ void CompositeType::getCapabilities( +@@ -164,7 +166,21 @@ void CompositeType::getCapabilities( .Case([&](VectorType type) { auto vecSize = getNumElements(); if (vecSize == 8 || vecSize == 16) { @@ -230,10 +209,10 @@ index 3f25696aa5eb..2d64fea0dc26 100644 capabilities.push_back(ref); } diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp -index 4072608dc8f8..3fc675632970 100644 +index d833ec9309ba..36840582a114 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp -@@ -43,9 +43,13 @@ using namespace mlir; +@@ -88,9 +88,13 @@ static std::optional> getTargetShape(VectorType vecType) { template static LogicalResult checkExtensionRequirements( LabelT label, const spirv::TargetEnv &targetEnv, @@ -249,7 +228,7 @@ index 4072608dc8f8..3fc675632970 100644 continue; LLVM_DEBUG({ -@@ -71,9 +75,13 @@ static LogicalResult checkExtensionRequirements( +@@ -116,9 +120,13 @@ static LogicalResult checkExtensionRequirements( template static LogicalResult checkCapabilityRequirements( LabelT label, const spirv::TargetEnv &targetEnv, @@ -265,7 +244,7 @@ index 4072608dc8f8..3fc675632970 100644 continue; LLVM_DEBUG({ -@@ -90,6 +98,55 @@ static LogicalResult checkCapabilityRequirements( +@@ -135,6 +143,55 @@ static LogicalResult checkCapabilityRequirements( return success(); } @@ -321,27 +300,24 @@ index 4072608dc8f8..3fc675632970 100644 /// Returns true if the given `storageClass` needs explicit layout when used in /// Shader environments. 
static bool needsExplicitLayout(spirv::StorageClass storageClass) { -@@ -247,12 +304,17 @@ convertScalarType(const spirv::TargetEnv &targetEnv, +@@ -280,11 +337,16 @@ convertScalarType(const spirv::TargetEnv &targetEnv, return nullptr; } -- if (auto floatType = dyn_cast(type)) { + //if (auto floatType = dyn_cast(type)) { + // Convert to 32-bit float and remove floatType related capability + // restriction -+ if (auto floatType = dyn_cast(type)) { + if (auto floatType = dyn_cast(type)) { LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); return Builder(targetEnv.getContext()).getF32Type(); } -- auto intType = cast(type); + //auto intType = cast(type); + // Convert to 32-bit int and remove intType related capability restriction -+ auto intType = cast(type); + auto intType = cast(type); LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); return IntegerType::get(targetEnv.getContext(), /*width=*/32, - intType.getSignedness()); -@@ -342,16 +404,40 @@ convertVectorType(const spirv::TargetEnv &targetEnv, +@@ -375,16 +437,40 @@ convertVectorType(const spirv::TargetEnv &targetEnv, cast(type).getExtensions(extensions, storageClass); cast(type).getCapabilities(capabilities, storageClass); @@ -389,7 +365,7 @@ index 4072608dc8f8..3fc675632970 100644 } static Type -@@ -1163,16 +1249,18 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) { +@@ -1553,16 +1639,18 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) { SmallVector, 4> typeExtensions; SmallVector, 8> typeCapabilities; for (Type valueType : valueTypes) { @@ -400,10 +376,9 @@ index 4072608dc8f8..3fc675632970 100644 - return false; - typeCapabilities.clear(); -- cast(valueType).getCapabilities(typeCapabilities); + cast(valueType).getCapabilities(typeCapabilities); - if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, - typeCapabilities))) -+ cast(valueType).getCapabilities(typeCapabilities); + typeExtensions.clear(); + cast(valueType).getExtensions(typeExtensions); + // Checking for capability and extension requirements along with capability @@ -418,10 +393,10 @@ index 4072608dc8f8..3fc675632970 100644 } diff --git a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir -index 0d92a8e676d8..d61ace8d6876 100644 +index 24a0bab352c3..96b8ea6e7975 100644 --- a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir +++ b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir -@@ -11,9 +11,9 @@ module attributes { +@@ -28,9 +28,9 @@ module attributes { #spirv.vce, #spirv.resource_limits<>> } { @@ -434,10 +409,10 @@ index 0d92a8e676d8..d61ace8d6876 100644 } diff --git a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir -index ae47ae36ca51..644996fe0fa7 100644 +index 1abe0fd2ec46..e485296ad026 100644 --- a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir +++ b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir -@@ -1447,6 +1447,40 @@ func.func @ops_flags(%arg0: i64, %arg1: i64) { +@@ -1462,6 +1462,40 @@ func.func @ops_flags(%arg0: i64, %arg1: i64) { %2 = arith.muli %arg0, %arg1 overflow : i64 // CHECK: %{{.*}} = spirv.IMul %{{.*}}, %{{.*}} : i64 %3 = arith.muli %arg0, %arg1 overflow : i64 @@ -586,7 +561,7 @@ index 53a1015de75b..6970b8ec0628 100644 spirv.Return } diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir -index 7dc0bd99f54b..5dd9901828cd 100644 +index 
5c24f0e6a7d3..3ca61ab48096 100644 --- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir @@ -166,7 +166,7 @@ func.func @logicalUnary(%arg0 : i1) @@ -599,10 +574,10 @@ index 7dc0bd99f54b..5dd9901828cd 100644 return } diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir -index f7fd05b36bae..5228bb719d94 100644 +index d8a26c71d12f..d22378817dbb 100644 --- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir -@@ -439,7 +439,7 @@ func.func @group_non_uniform_bitwise_and(%val: i32) -> i32 { +@@ -495,7 +495,7 @@ func.func @group_non_uniform_bitwise_and(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_and(%val: i1) -> i1 { @@ -611,7 +586,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseAnd "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -460,7 +460,7 @@ func.func @group_non_uniform_bitwise_or(%val: i32) -> i32 { +@@ -516,7 +516,7 @@ func.func @group_non_uniform_bitwise_or(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_or(%val: i1) -> i1 { @@ -620,7 +595,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseOr "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -481,7 +481,7 @@ func.func @group_non_uniform_bitwise_xor(%val: i32) -> i32 { +@@ -537,7 +537,7 @@ func.func @group_non_uniform_bitwise_xor(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_xor(%val: i1) -> i1 { @@ -629,7 +604,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseXor "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -502,7 +502,7 @@ func.func @group_non_uniform_logical_and(%val: i1) -> i1 { +@@ -558,7 +558,7 @@ func.func @group_non_uniform_logical_and(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_and(%val: i32) -> i32 { @@ -638,7 +613,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformLogicalAnd "Workgroup" "Reduce" %val : i32 return %0: i32 } -@@ -523,7 +523,7 @@ func.func @group_non_uniform_logical_or(%val: i1) -> i1 { +@@ -579,7 +579,7 @@ func.func @group_non_uniform_logical_or(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_or(%val: i32) -> i32 { @@ -647,7 +622,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformLogicalOr "Workgroup" "Reduce" %val : i32 return %0: i32 } -@@ -544,7 +544,7 @@ func.func @group_non_uniform_logical_xor(%val: i1) -> i1 { +@@ -600,7 +600,7 @@ func.func @group_non_uniform_logical_xor(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_xor(%val: i32) -> i32 { diff --git a/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch b/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch deleted file mode 100644 index 69232a7ba..000000000 --- a/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch +++ /dev/null @@ -1,49 +0,0 @@ -From dc1e914409a9d4c02c21a292227754fa4ac0cea7 Mon Sep 17 00:00:00 2001 -From: Dimple Prajapati -Date: Fri, 26 Apr 2024 20:30:34 +0000 -Subject: [PATCH 2/7] change-spirv.CL.printf-op-assembly-format - ---- - mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td | 4 ++-- - mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 4 ++-- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -index c7c2fe8bc742..b5ca27d7d753 100644 ---- 
a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -@@ -875,7 +875,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { - #### Example: - - ```mlir -- %0 = spirv.CL.printf %0 %1 %2 : (!spirv.ptr, (i32, i32)) -> i32 -+ %0 = spirv.CL.printf %0 : !spirv.ptr(%1, %2 : i32, i32) -> i32 - ``` - }]; - -@@ -889,7 +889,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { - ); - - let assemblyFormat = [{ -- $format `,` $arguments attr-dict `:` `(` type($format) `,` `(` type($arguments) `)` `)` `->` type($result) -+ $format `:` type($format) ( `(` $arguments^ `:` type($arguments) `)`)? attr-dict `->` type($result) - }]; - - let hasVerifier = 0; -diff --git a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -index 7a29abd44b34..b15ffdbbb767 100644 ---- a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -+++ b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -@@ -275,8 +275,8 @@ func.func @rintvec(%arg0 : vector<3xf16>) -> () { - //===----------------------------------------------------------------------===// - // CHECK-LABEL: func.func @printf( - func.func @printf(%arg0 : !spirv.ptr, %arg1 : i32, %arg2 : i32) -> i32 { -- // CHECK: spirv.CL.printf {{%.*}}, {{%.*}}, {{%.*}} : (!spirv.ptr, (i32, i32)) -> i32 -- %0 = spirv.CL.printf %arg0, %arg1, %arg2 : (!spirv.ptr, (i32, i32)) -> i32 -+ // CHECK: spirv.CL.printf {{%.*}} : !spirv.ptr({{%.*}}, {{%.*}} : i32, i32) -> i32 -+ %0 = spirv.CL.printf %arg0 : !spirv.ptr(%arg1, %arg2 : i32, i32) -> i32 - return %0 : i32 - } - --- -2.34.1 diff --git a/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch b/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch deleted file mode 100644 index 81d751d81..000000000 --- a/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 85635423ba70290147e674672854b90bbb81f555 Mon Sep 17 00:00:00 2001 -From: "Prajapati, Dimple" -Date: Fri, 26 Apr 2024 20:32:04 +0000 -Subject: [PATCH 3/7] Add-Constant-attribute-in-ParseDecoration - ---- - mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 1 + - mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index faaa42023a80..cfe3121bbe95 100644 ---- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -@@ -297,6 +297,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { - decorations[words[0]].set(symbol, llvm::dyn_cast(linkageAttr)); - break; - } -+ case spirv::Decoration::Constant: - case spirv::Decoration::Aliased: - case spirv::Decoration::AliasedPointer: - case spirv::Decoration::Block: -diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 200abdf993ce..a7d195d7fcb0 100644 ---- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -@@ -267,6 +267,7 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, - << stringifyDecoration(decoration); - case spirv::Decoration::Aliased: - case spirv::Decoration::AliasedPointer: -+ case spirv::Decoration::Constant: - case spirv::Decoration::Flat: - case spirv::Decoration::NonReadable: - case spirv::Decoration::NonWritable: --- -2.34.1 diff --git 
a/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch b/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch index 6bc75a749..9d2618753 100644 --- a/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch +++ b/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch @@ -1,22 +1,29 @@ +From 4cb4411e2451b1549bafd6a8a3723f78251ef6f3 Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 1 Oct 2024 08:59:35 -0500 +Subject: [PATCH] Add serialization and deserialization support for s + +--- + mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 6 ++++++ + mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 6 ++++++ + 2 files changed, 12 insertions(+) + diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index 12980879b20a..b5fbe8c5ceb8 100644 +index 6c7fe4106982..b1be812e74eb 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -@@ -259,8 +259,9 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { +@@ -259,6 +259,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { symbol, FPRoundingModeAttr::get(opBuilder.getContext(), static_cast(words[2]))); break; -- case spirv::Decoration::DescriptorSet: + case spirv::Decoration::Alignment: + case spirv::Decoration::DescriptorSet: case spirv::Decoration::Binding: -+ case spirv::Decoration::DescriptorSet: if (words.size() != 3) { - return emitError(unknownLoc, "OpDecorate with ") - << decorationName << " needs a single integer literal"; -@@ -319,6 +320,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { - case spirv::Decoration::Restrict: +@@ -320,6 +321,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: + case spirv::Decoration::SingleElementVectorINTEL: + case spirv::Decoration::VectorComputeCallableFunctionINTEL: + case spirv::Decoration::VectorComputeFunctionINTEL: @@ -24,7 +31,7 @@ index 12980879b20a..b5fbe8c5ceb8 100644 if (words.size() != 2) { return emitError(unknownLoc, "OpDecoration with ") << decorationName << "needs a single target "; -@@ -329,6 +334,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { +@@ -330,6 +335,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { // it is needed for many validation rules. 
decorations[words[0]].set(symbol, opBuilder.getUnitAttr()); break; @@ -33,7 +40,7 @@ index 12980879b20a..b5fbe8c5ceb8 100644 case spirv::Decoration::SpecId: if (words.size() != 3) { diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 714a3edfb565..bb3c68530aa9 100644 +index f355982e9ed8..d6080185eefe 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -252,8 +252,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, @@ -47,10 +54,10 @@ index 714a3edfb565..bb3c68530aa9 100644 case spirv::Decoration::Location: if (auto intAttr = dyn_cast(attr)) { args.push_back(intAttr.getValue().getZExtValue()); -@@ -286,6 +288,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, - case spirv::Decoration::Restrict: +@@ -287,6 +289,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: + case spirv::Decoration::SingleElementVectorINTEL: + case spirv::Decoration::VectorComputeCallableFunctionINTEL: + case spirv::Decoration::VectorComputeFunctionINTEL: @@ -58,3 +65,5 @@ index 714a3edfb565..bb3c68530aa9 100644 // For unit attributes and decoration attributes, the args list // has no values so we do nothing. if (isa(attr)) +-- +2.34.1 diff --git a/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch b/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch deleted file mode 100644 index 33c132dfb..000000000 --- a/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch +++ /dev/null @@ -1,432 +0,0 @@ -From c1a7d459790db5335907947cf44dcbd230cec783 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Thu, 29 Aug 2024 17:58:34 +0000 -Subject: [PATCH] move chunk_size into TensorDesc - ---- - .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 46 +++++++++++--- - .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 19 ++---- - .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 63 ++++++++++++------- - mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 40 ++++++++---- - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 31 ++++----- - 5 files changed, 122 insertions(+), 77 deletions(-) - -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -index f3ca09a6a68e..6ffb4eb3c60f 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -@@ -19,9 +19,15 @@ class XeGPUAttr traits = [], - let mnemonic = attrMnemonic; - } - --def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { -+class XeGPU_TensorDescAttr traits = [], -+ string baseCppClass = "::mlir::Attribute"> -+ : XeGPUAttr { -+ let assemblyFormat = "`<` struct(params) `>`"; -+} -+ -+def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { - let summary = [{a composite attribute for `TensorDescType`}]; -- let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite -+ let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite - attribute defined for `TensorDescType` for describing following - properties of a `TensorDesc`. - 1. `memory_scope`: It describes where the data block described by the -@@ -33,29 +39,49 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { - 8x32. Its default value is 1. - 3. 
`boundary_check`: It is used to indicates the hardware whether to do - out-of-boundary check. The default value is true. -- 4. `scattered`: It is used to differenciate TensorDescs created from -- `create_nd_tdesc` vs from `create_tdesc`. - }]; - - let parameters = (ins - OptionalParameter<"MemoryScopeAttr">: $memory_scope, - OptionalParameter<"IntegerAttr", "1">: $array_length, -- OptionalParameter<"BoolAttr", "true">: $boundary_check, -- OptionalParameter<"BoolAttr", "false">: $scattered -+ OptionalParameter<"BoolAttr", "true">: $boundary_check - ); - - let builders = [ - AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, - CArg<"int", "1">:$array_length, -- CArg<"bool", "true">: $boundary_check, -- CArg<"bool", "false">: $scattered -+ CArg<"bool", "true">: $boundary_check - )> - ]; - -- let assemblyFormat = "`<` struct(params) `>`"; - } - -+def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { -+ let summary = [{a composite attribute for `TensorDescType`}]; -+ let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite -+ attribute defined for `TensorDescType` for describing following -+ properties of a `TensorDesc`. -+ 1. `memory_scope`: It describes where the data block described by the -+ TensorDesc is located, `Global` device memory or `Shared` local memory. -+ It is default to `Global`. -+ 2. `chunk_size`: indicates number of continious elements accessed for each -+ offset, default is 1. It is used with `scattered` attr only. -+ }]; -+ -+ let parameters = (ins -+ OptionalParameter<"MemoryScopeAttr">: $memory_scope, -+ OptionalParameter<"IntegerAttr", "1">: $chunk_size -+ ); -+ -+ let builders = [ -+ AttrBuilder<(ins -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, -+ CArg<"int", "1">: $chunk_size -+ )> -+ ]; -+ } -+ - //===----------------------------------------------------------------------===// - // XeGPU Memory Scope Enums. - //===----------------------------------------------------------------------===// -@@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr: - let assemblyFormat = "$value"; - } - --#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD -\ No newline at end of file -+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index c32c7541c397..13a0bff5de1a 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -411,42 +411,33 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, - implying each element in the array corresponds to a work-item (SIMT lane) - in the subgroup. -- * chunk_size: [optional attribute] indicates number of continious -- elements accessed for each offset, default is 1. - - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - ```mlir - %a = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> -+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> - ``` - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. 
- It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> -+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> - ``` - - Example 3. It is similar to Example 2, but there is some overlaps among workitems. - It accesses: a[0:7], a[4:11], a[8:15], a[12:19] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> -+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>> - ``` - }]; - - let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $offsets, -- DenseI64ArrayAttr: $const_offsets, -- DefaultValuedAttr: $chunk_size); -+ DenseI64ArrayAttr: $const_offsets); - let results = (outs XeGPU_TensorDesc:$TensorDesc); - -- let builders = [ -- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, -- "llvm::ArrayRef": $offsets, -- CArg<"uint32_t", "1"> : $chunk_size)>, -- ]; -- - let assemblyFormat = [{ - $source - custom($offsets, $const_offsets) -@@ -723,7 +714,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] - - def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, - AllElementTypesMatch<["tensorDesc", "value", "result"]>, -- AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { -+ AllShapesMatch<["tensorDesc", "value", "result"]>]> { - let summary = "Atomic ready-modify-write operation on the TensorDesc. "; - - let description = [{ -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -index 9f101a71697b..8b22baf365af 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -@@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef": $shape, - "mlir::Type": $elementType, -- CArg<"bool", "false">: $scattered, - CArg<"int", "1">: $array_length, -- CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, -- CArg<"bool", "true">: $boundary_check -- )> -+ CArg<"bool", "true">: $boundary_check, -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>, -+ TypeBuilderWithInferredContext<(ins -+ "llvm::ArrayRef": $shape, -+ "mlir::Type": $elementType, -+ CArg<"int", "1">: $chunk_size, -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)> - ]; - - let extraClassDeclaration = [{ -@@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", - return llvm::cast(cloneWith(getShape(), elementType)); - } - -- TensorDescAttr getEncodingAsTensorDescAttr() const { -- return llvm::dyn_cast_if_present(getEncoding()); -+ BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const { -+ return llvm::dyn_cast_if_present(getEncoding()); -+ } -+ -+ ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const { -+ return llvm::dyn_cast_if_present(getEncoding()); - } - - xegpu::MemoryScope getMemoryScope() const { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getMemoryScope()) -- return attr.getMemoryScope().getValue(); -+ auto block_attr = getEncodingAsBlockTensorDescAttr(); -+ if (block_attr && block_attr.getMemoryScope()) -+ return block_attr.getMemoryScope().getValue(); -+ -+ 
auto scatter_attr = getEncodingAsScatterTensorDescAttr(); -+ if (scatter_attr && scatter_attr.getMemoryScope()) -+ return scatter_attr.getMemoryScope().getValue(); -+ - // return default value - return MemoryScope::Global; - } - - int getArrayLength() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getArrayLength()) -- return attr.getArrayLength().getInt(); -+ auto attr = getEncoding(); -+ auto block_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); -+ if (block_attr && block_attr.getArrayLength()) -+ return block_attr.getArrayLength().getInt(); - // return default value - return 1; - } - - bool getBoundaryCheck() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getBoundaryCheck()) -- return attr.getBoundaryCheck().getValue(); -+ auto attr = getEncoding(); -+ auto block_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); -+ if (block_attr && block_attr.getBoundaryCheck()) -+ return block_attr.getBoundaryCheck().getValue(); - // return default value - return true; - } - -- bool getScattered() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getScattered()) -- return attr.getScattered().getValue(); -- // return default value -- return false; -+ bool isScattered() { -+ return bool(getEncodingAsScatterTensorDescAttr()); -+ } -+ -+ int getChunkSize() { -+ auto attr = getEncoding(); -+ auto scatter_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr."); -+ if (scatter_attr && scatter_attr.getChunkSize()) -+ return scatter_attr.getChunkSize().getInt(); -+ return 1; - } - }]; - -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -index 24719fe748fe..0eab601bbaac 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -@@ -30,18 +30,28 @@ void XeGPUDialect::initialize() { - } - - //===----------------------------------------------------------------------===// --// XeGPU_TensorDescAttr -+// XeGPU_BlockTensorDescAttr - //===----------------------------------------------------------------------===// --TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, -- xegpu::MemoryScope memory_scope, -- int array_length, bool boundary_check, -- bool scattered) { -+BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, -+ xegpu::MemoryScope memory_scope, -+ int array_length, bool boundary_check) { - auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); - auto lengthAttr = - IntegerAttr::get(IntegerType::get(context, 64), array_length); - auto boundaryAttr = BoolAttr::get(context, boundary_check); -- auto scatteredAttr = BoolAttr::get(context, scattered); -- return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); -+ return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); -+} -+ -+//===----------------------------------------------------------------------===// -+// XeGPU_ScatterTensorDescAttr -+//===----------------------------------------------------------------------===// -+ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context, -+ xegpu::MemoryScope memory_scope, -+ int chunk_size) { -+ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); -+ auto chunkSizeAttr = -+ IntegerAttr::get(IntegerType::get(context, 64), chunk_size); -+ return Base::get(context, scopeAttr, 
chunkSizeAttr); - } - - //===----------------------------------------------------------------------===// -@@ -108,12 +118,18 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { - } - - TensorDescType TensorDescType::get(llvm::ArrayRef shape, -- mlir::Type elementType, bool scattered, -- int array_length, MemoryScope memory_scope, -- bool boundary_check) { -+ mlir::Type elementType, int array_length, -+ bool boundary_check, MemoryScope memory_scope) { -+ auto context = elementType.getContext(); -+ auto attr = BlockTensorDescAttr::get(context, memory_scope, array_length, boundary_check); -+ return Base::get(context, shape, elementType, attr); -+} -+ -+TensorDescType TensorDescType::get(llvm::ArrayRef shape, -+ mlir::Type elementType, int chunk_size, -+ MemoryScope memory_scope) { - auto context = elementType.getContext(); -- auto attr = TensorDescAttr::get(context, memory_scope, array_length, -- boundary_check, scattered); -+ auto attr = ScatterTensorDescAttr::get(context, memory_scope, chunk_size); - return Base::get(context, shape, elementType, attr); - } - -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index 8e185b8d2586..ee3834bd0d9c 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -153,7 +153,7 @@ LogicalResult CreateNdDescOp::verify() { - return emitOpError("TensorDesc should have the same element " - "type with the source if it is a memref.\n"); - -- if (getType().getScattered()) -+ if (getType().isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - return success(); -@@ -164,7 +164,7 @@ LogicalResult CreateNdDescOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult PrefetchNdOp::verify() { - auto tdescTy = getTensorDescType(); -- if (tdescTy.getScattered()) -+ if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -189,7 +189,7 @@ LogicalResult LoadNdOp::verify() { - if (tdescTy.getRank() > 2) - return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - -- if (tdescTy.getScattered()) -+ if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!valueTy) -@@ -257,7 +257,7 @@ LogicalResult StoreNdOp::verify() { - if (dstTy.getRank() > 2) - return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - -- if (dstTy.getScattered()) -+ if (dstTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!valTy) -@@ -280,7 +280,7 @@ LogicalResult StoreNdOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult UpdateNdOffsetOp::verify() { - auto ty = getTensorDescType(); -- if (ty.getScattered()) -+ if (ty.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - // number of offsets specified must match the rank of the tensor descriptor -@@ -293,28 +293,19 @@ LogicalResult UpdateNdOffsetOp::verify() { - //===----------------------------------------------------------------------===// - // XeGPU_CreateDescOp - //===----------------------------------------------------------------------===// --void CreateDescOp::build(OpBuilder &builder, OperationState &state, -- TensorDescType TensorDesc, Value source, -- llvm::ArrayRef offsets, -- uint32_t chunk_size) { -- llvm::SmallVector staticOffsets; -- llvm::SmallVector dynamicOffsets; -- dispatchIndexOpFoldResults(offsets, 
dynamicOffsets, staticOffsets); -- build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets, -- chunk_size); --} - - LogicalResult CreateDescOp::verify() { - auto tdescTy = getTensorDescType(); -- auto chunkSize = getChunkSize(); - - if (getRankOf(getSource()) > 1) - return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); - -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - -+ auto chunkSize = tdescTy.getChunkSize(); -+ - SmallVector shape({(int64_t)getNumOffsets()}); - if (chunkSize != 1) - shape.push_back(chunkSize); -@@ -332,7 +323,7 @@ LogicalResult CreateDescOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult PrefetchOp::verify() { - auto tdescTy = getTensorDescType(); -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -355,7 +346,7 @@ LogicalResult LoadGatherOp::verify() { - auto maskTy = getMaskType(); - auto valueTy = getValueType(); - -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -401,7 +392,7 @@ LogicalResult LoadGatherOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult StoreScatterOp::verify() { - auto tdescTy = getTensorDescType(); -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isWriteHintOrNone(getL1HintAttr())) --- -2.34.1 diff --git a/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch b/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch index 623657e55..72b2739c6 100644 --- a/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch +++ b/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch @@ -1,19 +1,19 @@ -From c5e6d0bd63d6aab004ae4e795f1466800c54b3ff Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Thu, 29 Aug 2024 19:18:42 +0000 -Subject: [PATCH] Add temporary changes for downstream: - add transposeBitWidth - for load_nd - add CompileHintOp +From 0829723718f1e80834d9d0051069e263fcfea82a Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 18:25:53 -0500 +Subject: [PATCH] xegpu temporary downstream defintion changes --- - mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++ - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- - 2 files changed, 7 insertions(+), 1 deletion(-) + mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++ + mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp | 1 + + mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index 13a0bff5de1a..64b15fd1cc32 100644 +index e24a056de2ca..948cc40e8595 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -285,6 +285,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor +@@ -302,6 +302,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor let arguments = (ins XeGPU_TensorDesc: $TensorDesc, OptionalAttr: $packed, OptionalAttr: $transpose, @@ -21,7 +21,7 @@ index 
13a0bff5de1a..64b15fd1cc32 100644 OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); -@@ -805,4 +806,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { +@@ -850,4 +851,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { let extraClassDeclaration = extraBaseClassDeclaration; } @@ -31,11 +31,23 @@ index 13a0bff5de1a..64b15fd1cc32 100644 +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD +diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +index fa0344276553..849de4fced8f 100644 +--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp ++++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +@@ -184,6 +184,7 @@ struct TransferReadLowering : public OpRewritePattern { + xegpu::CachePolicyAttr hint = nullptr; + auto loadOp = rewriter.create( + loc, vecTy, ndDesc, /*packed=*/nullptr, transposeAttr, ++ /*transpose_bit_width*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(readOp, loadOp); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index ee3834bd0d9c..98fc3308d96e 100644 +index 1a7a6b347840..121a7007208b 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -222,7 +222,7 @@ LogicalResult LoadNdOp::verify() { +@@ -236,7 +236,7 @@ LogicalResult LoadNdOp::verify() { emitWarning("Invalid transpose attr. It is ignored."); } diff --git a/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch b/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch index 248af9c41..d282deb4c 100644 --- a/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch +++ b/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch @@ -1,9 +1,7 @@ -From a2e340bcdb9936074795f1d28bef235be33a53b8 Mon Sep 17 00:00:00 2001 -From: Md Abdullah Shahneous Bari -Date: Tue, 20 Aug 2024 21:38:22 +0000 -Subject: [PATCH] This Patch enables Khronos extension: SPV_KHR_bfloat16. Most - of the ops specified in the extension is supported. Some notable exceptions - are: OpDot, OpCooperativeMatrixMulAddKHR. 
+From 1f270ef0932e583d3d12fa9af7082ddecf8d9546 Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 18:19:04 -0500 +Subject: [PATCH] SPIR-V Enable native bf16 support in SPIR-V dialect --- .../Dialect/SPIRV/IR/SPIRVArithmeticOps.td | 10 ++--- @@ -66,7 +64,7 @@ index 22d5afcd7738..de9e11493793 100644 let assemblyFormat = "operands attr-dict `:` type($vector1) `->` type($result)"; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -index 04952dd1dc61..6c9c348490ab 100644 +index ddaeb13ef253..9b43dbfe2341 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -343,6 +343,7 @@ def SPV_KHR_subgroup_rotate : I32EnumAttrCase<"SPV_KHR_subgroup @@ -77,7 +75,7 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPV_EXT_demote_to_helper_invocation : I32EnumAttrCase<"SPV_EXT_demote_to_helper_invocation", 1000>; def SPV_EXT_descriptor_indexing : I32EnumAttrCase<"SPV_EXT_descriptor_indexing", 1001>; -@@ -435,7 +436,7 @@ def SPIRV_ExtensionAttr : +@@ -434,7 +435,7 @@ def SPIRV_ExtensionAttr : SPV_KHR_fragment_shader_barycentric, SPV_KHR_ray_cull_mask, SPV_KHR_uniform_group_instructions, SPV_KHR_subgroup_rotate, SPV_KHR_non_semantic_info, SPV_KHR_terminate_invocation, @@ -86,7 +84,7 @@ index 04952dd1dc61..6c9c348490ab 100644 SPV_EXT_demote_to_helper_invocation, SPV_EXT_descriptor_indexing, SPV_EXT_fragment_fully_covered, SPV_EXT_fragment_invocation_density, SPV_EXT_fragment_shader_interlock, SPV_EXT_physical_storage_buffer, -@@ -1193,6 +1194,22 @@ def SPIRV_C_ShaderClockKHR : I32EnumAttrCase<"Shade +@@ -1192,6 +1193,22 @@ def SPIRV_C_ShaderClockKHR : I32EnumAttrCase<"Shade Extension<[SPV_KHR_shader_clock]> ]; } @@ -109,7 +107,7 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPIRV_C_FragmentFullyCoveredEXT : I32EnumAttrCase<"FragmentFullyCoveredEXT", 5265> { list implies = [SPIRV_C_Shader]; list availability = [ -@@ -1491,6 +1508,7 @@ def SPIRV_CapabilityAttr : +@@ -1484,6 +1501,7 @@ def SPIRV_CapabilityAttr : SPIRV_C_RayQueryKHR, SPIRV_C_RayTracingKHR, SPIRV_C_Float16ImageAMD, SPIRV_C_ImageGatherBiasLodAMD, SPIRV_C_FragmentMaskAMD, SPIRV_C_StencilExportEXT, SPIRV_C_ImageReadWriteLodAMD, SPIRV_C_Int64ImageEXT, SPIRV_C_ShaderClockKHR, @@ -117,7 +115,7 @@ index 04952dd1dc61..6c9c348490ab 100644 SPIRV_C_FragmentFullyCoveredEXT, SPIRV_C_MeshShadingNV, SPIRV_C_FragmentDensityEXT, SPIRV_C_ShaderNonUniform, SPIRV_C_RuntimeDescriptorArray, SPIRV_C_StorageTexelBufferArrayDynamicIndexing, SPIRV_C_RayTracingNV, -@@ -4148,16 +4166,21 @@ def SPIRV_Bool : TypeAlias; +@@ -4139,16 +4157,21 @@ def SPIRV_Bool : TypeAlias; def SPIRV_Integer : AnyIntOfWidths<[8, 16, 32, 64]>; def SPIRV_Int16 : TypeAlias; def SPIRV_Int32 : TypeAlias; @@ -142,7 +140,7 @@ index 04952dd1dc61..6c9c348490ab 100644 // Component type check is done in the type parser for the following SPIR-V // dialect-specific types so we use "Any" here. 
def SPIRV_AnyPtr : DialectType; @@ -152,14 +150,14 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPIRV_Aggregate : AnyTypeOf<[SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct]>; def SPIRV_Composite : AnyTypeOf<[SPIRV_Vector, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct, - SPIRV_AnyCooperativeMatrix, SPIRV_AnyJointMatrix, SPIRV_AnyMatrix]>; + SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix]>; def SPIRV_Type : AnyTypeOf<[ - SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector, + SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_BFloat16KHR, SPIRV_Vector, SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct, - SPIRV_AnyCooperativeMatrix, SPIRV_AnyJointMatrix, SPIRV_AnyMatrix, - SPIRV_AnySampledImage -@@ -4764,6 +4787,12 @@ def SPIRV_FPFMM_AllowReassocINTEL : I32BitEnumAttrCaseBit<"AllowReassocINTEL", 1 + SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage + ]>; +@@ -4738,6 +4761,12 @@ def SPIRV_FPFMM_AllowReassocINTEL : I32BitEnumAttrCaseBit<"AllowReassocINTEL", 1 ]; } @@ -273,7 +271,7 @@ index b05ee0251df5..a5c8aa8fb450 100644 let summary = [{ Convert value numerically from one floating-point width to another diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp -index 654b0a8a2ed0..74f7d06d5272 100644 +index b4ad5923e975..d477c089732a 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -171,8 +171,10 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, @@ -290,10 +288,10 @@ index 654b0a8a2ed0..74f7d06d5272 100644 } } else if (auto t = llvm::dyn_cast(type)) { diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -index 0eac34ee3a0f..16dcdd60a4bb 100644 +index 542c6beba2e4..27bfc1871528 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -@@ -596,7 +596,7 @@ bool ScalarType::classof(Type type) { +@@ -521,7 +521,7 @@ bool ScalarType::classof(Type type) { } bool ScalarType::isValid(FloatType type) { @@ -302,7 +300,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 } bool ScalarType::isValid(IntegerType type) { -@@ -605,6 +605,14 @@ bool ScalarType::isValid(IntegerType type) { +@@ -530,6 +530,14 @@ bool ScalarType::isValid(IntegerType type) { void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, std::optional storage) { @@ -317,7 +315,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 // 8- or 16-bit integer/floating-point numbers will require extra extensions // to appear in interface storage classes. See SPV_KHR_16bit_storage and // SPV_KHR_8bit_storage for more details. 
-@@ -623,7 +631,7 @@ void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, +@@ -548,7 +556,7 @@ void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, [[fallthrough]]; case StorageClass::Input: case StorageClass::Output: @@ -326,7 +324,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 static const Extension exts[] = {Extension::SPV_KHR_16bit_storage}; ArrayRef ref(exts, std::size(exts)); extensions.push_back(ref); -@@ -710,7 +718,20 @@ void ScalarType::getCapabilities( +@@ -635,7 +643,20 @@ void ScalarType::getCapabilities( } else { assert(llvm::isa(*this)); switch (bitwidth) { @@ -349,7 +347,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 case 32: break; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index 14fd4d5d4e40..4960dc7053e0 100644 +index cccf360b8e21..d38615eed7f1 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -817,14 +817,20 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode, @@ -377,7 +375,7 @@ index 14fd4d5d4e40..4960dc7053e0 100644 case 32: floatTy = opBuilder.getF32Type(); break; -@@ -1366,6 +1372,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef operands, +@@ -1330,6 +1336,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef operands, } else if (floatType.isF16()) { APInt data(16, operands[2]); value = APFloat(APFloat::IEEEhalf(), data); @@ -388,7 +386,7 @@ index 14fd4d5d4e40..4960dc7053e0 100644 auto attr = opBuilder.getFloatAttr(floatType, value); diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 64c23c75d4cd..b7d50073db99 100644 +index 10e5264bffac..26a8f7bb5fa9 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -477,6 +477,9 @@ LogicalResult Serializer::prepareBasicType( @@ -401,7 +399,7 @@ index 64c23c75d4cd..b7d50073db99 100644 return success(); } -@@ -984,7 +987,8 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr, +@@ -965,7 +968,8 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr, } words = llvm::bit_cast(value.convertToDouble()); encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, words.word1, words.word2}); @@ -411,3 +409,5 @@ index 64c23c75d4cd..b7d50073db99 100644 uint32_t word = static_cast(value.bitcastToAPInt().getZExtValue()); encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word}); +-- +2.34.1 diff --git a/build_tools/patches/0010-refine-the-XeGPU-definition.patch b/build_tools/patches/0010-refine-the-XeGPU-definition.patch deleted file mode 100644 index 89d37c371..000000000 --- a/build_tools/patches/0010-refine-the-XeGPU-definition.patch +++ /dev/null @@ -1,206 +0,0 @@ -From 8a734652353bdd85b9cc7d2426e7395404372d72 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Wed, 28 Aug 2024 23:57:49 +0000 -Subject: [PATCH] refine the XeGPU definition - add verification for - scattered tensordesc regarding to chunk size and total size - refine - load_gather and store_scatter to reveal transpose effect - ---- - .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 40 +++++++++++------ - mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 + - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 44 ++++++++++++++++--- - 3 files changed, 65 insertions(+), 20 deletions(-) - -diff --git 
a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index a3922bbad2b3..3e0c6f243fd4 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -413,24 +413,28 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - implying each element in the array corresponds to a work-item (SIMT lane) - in the subgroup. - -+ The first dimension of the result TensorDesc corresponds to work-items, so it should -+ match the dimension of offsets. It may also has a second dimension corresponding to -+ the chunk_size if the chunk size is larger than 1. -+ - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - ```mlir - %a = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> -+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> - ``` - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. - It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> -+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> - ``` - - Example 3. It is similar to Example 2, but there is some overlaps among workitems. - It accesses: a[0:7], a[4:11], a[8:15], a[12:19] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>> -+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>> - ``` - }]; - -@@ -500,28 +504,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] - - let description = [{ It (aka. load) load data per each work-item. The output - describes the data being loaded at the subgroup level, so its size is -- consistent with the number of work-items in a subgroup. When `chunk_size_per_lane` -- attribute is larger than 1 in TensorDesc, the output vector will be 2D vector, -- with dim-1 correspoding to the chunk size. -+ consistent with the number of work-items in a subgroup. When the chunk size -+ is larger than 2, the output vector is a 2D vector, with dim-1 correspoding -+ to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item. -+ Specially, there is a transpose effect on the result (as compared to the TensorDesc) -+ due to the hardware implementation. Therefore, a transpose attribute is introduced -+ on purpose, making sure users are aware of this implicit transformation. - - The mask operand masks out memory access so that it is safe to pass out-of-boundary - addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. 
- - Example: - ```mlir -- %2 = xegpu.load %1, %0 {transpose = [1, 0], -+ %2 = xegpu.load %1, %0 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} -- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -- -> vector<16xf32> -+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, -+ vector<16xi1> -> vector<16xf32> - ``` - - }]; - - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, -- OptionalAttr: $transpose, -+ OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); -@@ -553,11 +560,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] - let hasVerifier = 1; - } - --def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>, -- AllElementTypesMatch<["value", "TensorDesc"]>]> { -+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>, -+ AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "store data to scattered memory locations."; -- let description = [{ It (aka. store) stores data to scattered memory locations. -- It has similar semantic to `load_gather`. -+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is -+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be -+ a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes -+ and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter` -+ has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is -+ introduced on purpose, making sure users are aware of this implicit transformation. - - Example: - ```mlir -@@ -572,6 +583,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe - XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, -+ OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -index 0eab601bbaac..555c232ff1f0 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -@@ -57,6 +57,7 @@ ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context, - //===----------------------------------------------------------------------===// - // XeGPU_TensorDescType - //===----------------------------------------------------------------------===// -+ - mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { - llvm::SmallVector shape; - mlir::Type elementType; -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index c9e399a7149f..b35a639540aa 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -305,6 +305,26 @@ LogicalResult CreateDescOp::verify() { - - auto chunkSize = tdescTy.getChunkSize(); - -+ // check chunk_size -+ llvm::SmallVector supportedChunkSizes = {1, 2, 3, 4, 8, 16, 32, 64, 128, 256}; -+ if (!llvm::is_contained(supportedChunkSizes, chunkSize)) -+ return emitOpError("Invalid chunk_size. 
Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256."); -+ -+ // check total size -+ auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth(); -+ auto bitsPerLane = elemBits * chunkSize; -+ if (chunkSize > 1 && bitsPerLane % 32) { -+ // For 8-bit and 16-bit data, the hardware only supports chunk size of 1. -+ // For 32-bit data, the hardware can support larger larger chunk size. So -+ // we can bitcast 8-bit/16-bit data to 32-bit data for better performance. -+ // But this requires the total size is 32 bit aligned to make the optimization work. -+ return emitOpError("access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned."); -+ } -+ -+ auto lscConstraints = 512 * 8; // each access is upto 512 bytes. -+ if (elemBits * tdescTy.getNumElements() > lscConstraints) -+ return emitOpError("total access size (simd_lanes * chunk_size * sizeof(elemTy)) is upto 512 bytes."); -+ - SmallVector shape({(int64_t)getNumOffsets()}); - if (chunkSize != 1) - shape.push_back(chunkSize); -@@ -370,14 +390,13 @@ LogicalResult LoadGatherOp::verify() { - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - -- if (getTransposeAttr()) { -- auto trans = getTranspose().value(); -- if (tdescShape.size() < trans.size()) -- emitWarning("Invalid transpose attr. It is ignored."); -- else -- transpose(trans, tdescShape); -+ if (tdescTy.getRank() == 2) { -+ if (!getTransposeAttr()) -+ return emitOpError("load_gather has to be transposed."); -+ transpose({1, 0}, tdescShape); - } - -+ - if (valueShape != tdescShape) - return emitOpError("Unexpected result shape") - << "(Expected shape: " << makeString(tdescShape) -@@ -404,11 +423,24 @@ LogicalResult StoreScatterOp::verify() { - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); - - auto maskTy = getMaskType(); -+ auto valueTy = getValueType(); - auto maskShape = getShapeOf(maskTy); - auto tdescShape = getShapeOf(tdescTy); -+ auto valueShape = getShapeOf(valueTy); - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - -+ if (tdescTy.getRank() == 2) { -+ if (!getTransposeAttr()) -+ return emitOpError("load_gather has to be transposed."); -+ transpose({1, 0}, tdescShape); -+ } -+ -+ if (valueShape != tdescShape) -+ return emitOpError("Unexpected value shape") -+ << "(Expected shape: " << makeString(tdescShape) -+ << ", Given shape: " << makeString(valueShape) << ").\n"; -+ - return success(); - } - //===----------------------------------------------------------------------===// --- -2.34.1 diff --git a/docs/rfcs/XeGPU.md b/docs/rfcs/XeGPU.md index 0524f4175..3d87e116f 100644 --- a/docs/rfcs/XeGPU.md +++ b/docs/rfcs/XeGPU.md @@ -16,13 +16,13 @@ Below is a summary. 
| Ops | Syntax | Example | | :--- | :---- | :--- | -|create_tdesc | operation ::= xegpu.create_tdesc $base_addr, $offset attr-dict : type($base_addr), type($offset) -> type($tdesc) | %scatter_tdesc = xegpu.create_tdesc %mem_addr, %offset: int64, Vector<16 x index> -> tensor_desc<16 x bf16, #xegpu.scatter_tdesc_attr> | +|create_tdesc | operation ::= xegpu.create_tdesc $base_addr, $offset attr-dict : type($base_addr), type($offset) -> type($tdesc) | %scatter_tdesc = xegpu.create_tdesc %mem_addr, %offset: int64, Vector<16 x index> -> tensor_desc<16 x bf16, #xegpu.scatter_tdesc_attr> | |load_gather | operation ::= xegpu.load_gather $tdesc, $mask attr-dict : type($tdesc), type($mask) -> type($res) | %result = xegpu.load_gather %scatter_tdesc, %mask {L1 = cached, L2 = uncached, transpose} : tensor_desc<16x8xbf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<8x16xbf16> | |store_scatter | operation ::= xegpu.store_scatter $value, $tdesc, $mask attr-dict : type($value), type($tdesc), type($mask) | xegpu.store_scatter %value, %scatter_tdesc, %mask {L1 = cached, L2 = uncached} : vector<16xbf16>, tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> | |update_offset | operation ::= xegpu.update_offset $tdesc, $delta : type($tdesc), type($delta) -> type($tdesc) | %tdesc_updated = xegpu.update_offset %tdesc, %offsets: tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>> | |Prefetch | operation ::= xegpu.prefetch $tdesc attr-dict : type($tdesc) | xegpu.prefetch %scatter_tdesc1 {L1 = cached, L2 = uncached} : tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>> | |atomic_rmw | operation ::= xegpu.atomic_rmw $kind, $value, $tdesc, $mask attr-dict : type($value), type($tdesc), type($mask) | %ret_value = xegpu.atomic_rmw “addf”, %value, %scatter_mem2, %mask : vector<16xbf16>, tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> | -|create_nd_tdesc | operation ::= xegpu.create_nd_tdesc $base_addr, $offset0, $offset1, $tdim0, $tdim1, $tstride0 attr-dict : type($base_addr), index, index, index, index, index, index -> type($tdesc) | %tdesc = xegpu.create_nd_tdesc %mem_addr, %tile_offset:2, %base_shape:2,%base_strides:2: int64, index, index, index, index, index, index -> tensor_desc<8x16xbf16, #xegpu.block_tdesc_attr> | +|create_nd_tdesc | operation ::= xegpu.create_nd_tdesc $base_addr, $offset0, $offset1, $tdim0, $tdim1, $tstride0 attr-dict : type($base_addr), index, index, index, index, index, index -> type($tdesc) | %tdesc = xegpu.create_nd_tdesc %mem_addr, %tile_offset:2, %base_shape:2,%base_strides:2: int64, index, index, index, index, index, index -> tensor_desc<8x16xbf16, #xegpu.block_tdesc_attr> | |load_nd | operation ::= xegpu.load_nd $tdesc attr-dict : type($tdesc) -> type($res) | %result = xegpu.load_nd %tdesc {L1_hint = uncached, L3_hint = uncached} : tensor_desc<8x16xbf16> -> vector<8x16xbf16> | |dpas | operation ::= xegpu.dpas $matC, $matA, $matB attr_dict : type($matC), type($matA), type($matB) -> type($res) | %vector_c = xegpu.dpas %vector_c, %vector_a, %vector_b: vector<8x16xfloat>, vector<8x8x2xbf16>, vector<8x16x2xbf16> -> vector<8x16xfloat> | |store_nd | operation ::= xegpu.store_nd $value, $tdesc attr-dict : type($value), type($tdesc) | xegpu.store_nd %value, %tdesc {L1_hint = uncached, L3_hint = uncached} : vector<8x16xbf16>, tensor_desc<8x16xbf16> | @@ -66,7 +66,7 @@ data fragments and will be introduced in the next section in details. 
XeGPU oper `create_nd_tdesc` can also accept an optional `block_tdesc_attr` to extend its capablity. The `block_tdesc_attr` could encode the following optional attributes: -- `memory_scope`. It describes where the data block being described is located. `global` means device memory, or `slm` means shared local memory. +- `memory_space`. It describes where the data block being described is located. `global` means device memory, or `slm` means shared local memory. It is default to `global`. However, it has to match with the memory scope of the base addresses. If the base address is for shared local memory, than the memory scope of the tensor_desc has to be shared local memory too. - `array_length`. It is only used for load. It describes how many horizontally consecutive blocks will be loaded by a hardware load instruction. @@ -96,16 +96,16 @@ create_nd_tdesc also accepts a memref as input instead of a memory address, shap The example below accepts a memory address and an offset and creates a 1D tensor_desc. The tensor_desc describes a 1D vector that is loaded by all work items combined within the subgroup. ```mlir #sg_map_a = xegpu.sg_map - #tdesc_attr1 = !xegpu.block_tdesc_attr + #tdesc_attr1 = !xegpu.block_tdesc_attr %tdesc1 = xegpu.create_nd_tdesc %mem_addr, %offset : uint64, index into tensor_desc<16xbf16, #tdesc_attr1> - #tdesc_attr2 = !xegpu.block_tdesc_attr + #tdesc_attr2 = !xegpu.block_tdesc_attr %tdesc2 = xegpu.create_nd_tdesc %mem_addr, %offset : uint64, index into tensor_desc<16xbf16, #tdesc_attr2> ``` -Attribute `memory_scope` indicates whether the tensor is located in the global or shared local memory. The default value is global. +Attribute `memory_space` indicates whether the tensor is located in the global or shared local memory. The default value is global. Attribute `boundary_check` indicates whether the operation detects the boundary and pads with zero for out-of-boundary access. The default value is true. For 1D tensor description, the base_shape and base_stride are optional, the attribute “boundary_check” must be false, “%mem_add + %offset” must not access out-of-boundary memory to avoid undefined behavior. @@ -197,7 +197,7 @@ When this variant is used, the matrix B must be in VNNI layout, and the matrix A ``` `prefetch_nd` prefetches the memory specified by tensor_desc to cache. -Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_scope` can be applied to prefetch_nd. +Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_space` can be applied to prefetch_nd. ```mlir xegpu.prefetch_nd %tdesc2: tensor_desc<8x16xbf16> xegpu.prefetch_nd %tdesc2: tensor_desc<16xbf16> @@ -230,7 +230,7 @@ creates a tensor_desc, which describes the memory base address and offsets for 1 `scatter_tdesc_attr` could also contain the following optional attributes to extend the capbility of the operator, as shown in the following example. -- `memory_scope`. It has the same semantic to the one in `block_tdesc_attr`, describing where the data block being +- `memory_space`. It has the same semantic to the one in `block_tdesc_attr`, describing where the data block being described is located: global means device memory, and slm means shared local memory. It has to match with the memory scope of the base addresses. It is default to global. - `chunk_size`. It specifies the size being loaded per each work item, when each work item may load a consecutive @@ -241,7 +241,7 @@ as shown in the following example. 
a valid chunk size could be 2, 4, 8, 16, 32, 64, and for int8, a valid chunk size could be 4, 8, 16, 32, 64. ```mlir - #tdesc_attr = !xegpu.scatter_tdesc_attr< memory_scope=slm, chunk_size=8> + #tdesc_attr = !xegpu.scatter_tdesc_attr< memory_space=slm, chunk_size=8> %scatter_tdesc_chunk = xegpu.create_tdesc, %base_addr, %offsets : uint64, vector<16xindex> into tensor_desc<16x8xuint16, #tdesc_attr> ``` @@ -258,7 +258,7 @@ When loading a tensor_desc with chunk_size attribute, the output vector must be The transpose attribute must be present to explicitly describe the transpose effect. ```mlir - #tdesc_attr = #xegpu.scatter_tdesc_attr + #tdesc_attr = #xegpu.scatter_tdesc_attr %result = xegpu.load_gather %scatter_tdesc_chunk, %mask {L1 = cached, L2 = uncached, transpose} : tensor_desc<16x8xbf16, #tdesc_attr>, vector<16xi1> -> vector<8x16xbf16> ``` @@ -276,7 +276,7 @@ uint32, uint64. xegpu.store_scatter %value, %scatter_tdesc1, %mask : vector<16xuint16>, vector<16xi1>, tensor_desc<16xuint16, #xegpu.scatter_tdesc_attr<>> ``` -Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_scope` can be applied to `store_scatter`. Similar to `load_gather`, +Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_space` can be applied to `store_scatter`. Similar to `load_gather`, when the `chunk_size` of `tensor_desc` is specified, the `value` is a 2D vector with the shape of [chunk_size, subgroup_size]. `prefetch` prefetches data from the memory specified by tensor_desc. @@ -392,7 +392,7 @@ For load_nd with `transpose` attribute, wi_layout is transposed to match with th `xegpu.sg_map` is also used to describe the WI data distribution for regular load. Below example shows that each WI loads one fp32 data element. The result vector <16xfp32> is loaded and distributed to each WI as <1xf32>. ```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc = xegpu.create_tdesc, %src_addr, %offsets: uint64, vector<16xindex> into tensor_desc<16xfp32, #scatter_attr, #sg_map_t> @@ -403,7 +403,7 @@ For load_nd with `transpose` attribute, wi_layout is transposed to match with th Below example shows that each WI loads 4 fp32 data element with the chunk_size_per_lane. This load with chunk_size_per_lane is effectively load 2D tensor and transpose. The data fragement <1x4xf32> is loaded and transposed as <4x1xf32>. 
```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc_chunk = xegpu.create_tdesc, %src_addr, %offsets {chunk_size_per_lane=4} : uint64, vector<16xindex> into tensor_desc<16x4xfp32, #scatter_attr, #sg_map_t> @@ -554,7 +554,7 @@ An example on how to perform transpose using load_gather with chunk_size_per_lan ```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc_chunk = xegpu.create_tdesc, %src_addr, %offsets {chunk_size_per_lane=4} : uint64, vector<16xindex> into tensor_desc<16x4xfp32, #scatter_attr, #sg_map_t> @@ -563,7 +563,7 @@ An example on how to perform transpose using load_gather with chunk_size_per_lan tensor_desc<16x4xfp32, #tdesc_attr, #sg_map_t>, vector<16xi1> -> vector<4x1xfp32> #sg_map = xegpu.sg_map - #tdesc_attr = !xegpu.tdesc_attr< memory_scope=slm, boundary_check=false> + #tdesc_attr = !xegpu.tdesc_attr< memory_space=slm, boundary_check=false> %tdesc2 = xegpu.create_nd_tdesc %dest_addr, %offset: uint64, index into tensor_desc<64xfp32, #tdesc_attr> xegpu.store_nd %value, %tdesc2: diff --git a/include/imex/Dialect/XeTile/IR/XeTileAttrs.td b/include/imex/Dialect/XeTile/IR/XeTileAttrs.td index e8c168e19..70de8c0e8 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileAttrs.td +++ b/include/imex/Dialect/XeTile/IR/XeTileAttrs.td @@ -64,7 +64,7 @@ def XeTile_TileAttr : XeTile_Attr<"XeTile", "tile_attr"> { OptionalParameter<"xetile::WorkGroupMapAttr">:$wg_map, DefaultValuedParameter<"mlir::DenseI32ArrayAttr", "mlir::DenseI32ArrayAttr::get($_ctxt, {1, 0})">:$order, OptionalParameter<"mlir::DenseI64ArrayAttr">:$inner_blocks, - OptionalParameter<"mlir::Attribute">:$memory_scope + OptionalParameter<"mlir::Attribute">:$memory_space ); let assemblyFormat = "`<` struct(params) `>`"; let genVerifyDecl = true; @@ -73,31 +73,31 @@ def XeTile_TileAttr : XeTile_Attr<"XeTile", "tile_attr"> { CArg<"xetile::WorkGroupMapAttr", "{}">:$wg_map, CArg<"llvm::ArrayRef", "{1, 0}">:$order, CArg<"llvm::ArrayRef", "{}">:$inner_blocks, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, sg_map, wg_map, mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, inner_blocks), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]>, AttrBuilder<(ins CArg<"llvm::ArrayRef", "{1, 0}">:$order, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, xetile::SubGroupMapAttr(), xetile::WorkGroupMapAttr(), mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, {}), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]>, AttrBuilder<(ins CArg<"xetile::SubGroupMapAttr", "{}">:$sg_map, CArg<"xetile::WorkGroupMapAttr", "{}">:$wg_map, CArg<"llvm::ArrayRef", "{1, 0}">:$order, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, sg_map, wg_map, mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, {}), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]> 
]; } diff --git a/include/imex/Dialect/XeTile/IR/XeTileTypes.td b/include/imex/Dialect/XeTile/IR/XeTileTypes.td index 0e9ee2908..df6fa1b40 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileTypes.td +++ b/include/imex/Dialect/XeTile/IR/XeTileTypes.td @@ -117,20 +117,20 @@ def XeTile : XeTile_Type<"Tile", "tile", [ShapedTypeInterface], return mlir::DenseI32ArrayAttr::get(getContext(), {1, 0}); } - mlir::Attribute getMemoryScope() { + mlir::Attribute getMemorySpace() { auto encoding = llvm::dyn_cast_if_present(getEncoding()); if (encoding) - return encoding.getMemoryScope(); + return encoding.getMemorySpace(); return mlir::Attribute(); } - int getMemoryScopeAsInt() { + int getMemorySpaceAsInt() { auto encoding = llvm::dyn_cast_if_present(getEncoding()); - if (encoding && encoding.getMemoryScope()) { - auto memoryScope = encoding.getMemoryScope(); - assert(mlir::isa(memoryScope) && + if (encoding && encoding.getMemorySpace()) { + auto MemorySpace = encoding.getMemorySpace(); + assert(mlir::isa(MemorySpace) && "Using `getMemorySpaceAsInt` with non-Integer attribute"); - return mlir::cast(memoryScope).getInt(); + return mlir::cast(MemorySpace).getInt(); } // return default value 0 indicating Global memory return 0; diff --git a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index 521dd2b5f..b4ea05ec8 100644 --- a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -69,118 +69,6 @@ class GPUXToSPIRVPass : public impl::ConvertGPUXToSPIRVBase { bool mapMemorySpace; }; -class PrintfOpPattern : public mlir::OpConversionPattern { -public: - using mlir::OpConversionPattern::OpConversionPattern; - mlir::LogicalResult - matchAndRewrite(mlir::gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - auto loc = gpuPrintfOp.getLoc(); - - auto funcOp = rewriter.getBlock() - ->getParent() - ->getParentOfType(); - - auto moduleOp = funcOp->getParentOfType(); - - const char formatStringPrefix[] = "printfMsg"; - unsigned stringNumber = 0; - mlir::SmallString<16> globalVarName; - mlir::spirv::GlobalVariableOp globalVar; - - // formulate spirv global variable name - do { - globalVarName.clear(); - (formatStringPrefix + llvm::Twine(stringNumber++)) - .toStringRef(globalVarName); - } while (moduleOp.lookupSymbol(globalVarName)); - - auto i8Type = rewriter.getI8Type(); - auto i32Type = rewriter.getI32Type(); - - unsigned scNum = 0; - auto createSpecConstant = [&](unsigned value) { - auto attr = rewriter.getI8IntegerAttr(value); - mlir::SmallString<16> specCstName; - (llvm::Twine(globalVarName) + "_sc" + llvm::Twine(scNum++)) - .toStringRef(specCstName); - - return rewriter.create( - loc, rewriter.getStringAttr(specCstName), attr); - }; - - // define GlobalVarOp with printf format string using SpecConstants - // and make composite of SpecConstants - { - mlir::Operation *parent = - mlir::SymbolTable::getNearestSymbolTable(gpuPrintfOp->getParentOp()); - - mlir::ConversionPatternRewriter::InsertionGuard guard(rewriter); - - mlir::Block &entryBlock = *parent->getRegion(0).begin(); - rewriter.setInsertionPointToStart( - &entryBlock); // insertion point at module level - - // Create Constituents with SpecConstant to construct - // SpecConstantCompositeOp - llvm::SmallString<20> formatString(gpuPrintfOp.getFormat()); - formatString.push_back('\0'); // Null terminate for C - mlir::SmallVector constituents; - for (auto c : formatString) { - auto cSpecConstantOp = createSpecConstant(c); 
- constituents.push_back(mlir::SymbolRefAttr::get(cSpecConstantOp)); - } - - // Create specialization constant composite defined via spirv.SpecConstant - size_t contentSize = constituents.size(); - auto globalType = mlir::spirv::ArrayType::get(i8Type, contentSize); - mlir::spirv::SpecConstantCompositeOp specCstComposite; - mlir::SmallString<16> specCstCompositeName; - (llvm::Twine(globalVarName) + "_scc").toStringRef(specCstCompositeName); - specCstComposite = rewriter.create( - loc, mlir::TypeAttr::get(globalType), - rewriter.getStringAttr(specCstCompositeName), - rewriter.getArrayAttr(constituents)); - - // Define GlobalVariable initialized from Constant Composite - globalVar = rewriter.create( - loc, - mlir::spirv::PointerType::get( - globalType, mlir::spirv::StorageClass::UniformConstant), - globalVarName, mlir::FlatSymbolRefAttr::get(specCstComposite)); - globalVar->setAttr("Constant", rewriter.getUnitAttr()); - } - - // Get SSA value of Global variable - mlir::Value globalPtr = - rewriter.create(loc, globalVar); - - mlir::Value fmtStr = rewriter.create( - loc, - mlir::spirv::PointerType::get( - i8Type, mlir::spirv::StorageClass::UniformConstant), - globalPtr); - - // Get printf arguments - auto argsRange = adaptor.getArgs(); - mlir::SmallVector printfArgs; - printfArgs.reserve(argsRange.size() + 1); - printfArgs.append(argsRange.begin(), argsRange.end()); - - rewriter.create(loc, i32Type, fmtStr, printfArgs); - - rewriter.eraseOp(gpuPrintfOp); - - return mlir::success(); - } -}; - -void populateGPUPrintfToSPIRVPatterns(mlir::SPIRVTypeConverter &typeConverter, - mlir::RewritePatternSet &patterns) { - - patterns.add(typeConverter, patterns.getContext()); -} - // This op: // vector.create_mask %maskVal : vector // is lowered to: @@ -519,7 +407,6 @@ void GPUXToSPIRVPass::runOnOperation() { mlir::populateSCFToSPIRVPatterns(typeConverter, scfToSpirvCtx, patterns); mlir::cf::populateControlFlowToSPIRVPatterns(typeConverter, patterns); mlir::populateMathToSPIRVPatterns(typeConverter, patterns); - imex::populateGPUPrintfToSPIRVPatterns(typeConverter, patterns); imex::populateVectorToSPIRVPatterns(typeConverter, patterns); if (failed(applyFullConversion(gpuModule, *target, std::move(patterns)))) diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp index c883399b8..221d94dcd 100644 --- a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp +++ b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp @@ -177,11 +177,11 @@ static LogicalResult isValidScatterSetup(Type elemTy, int simd_lanes, // or the data for store. It is not used for prefetch. prefetch on slm is not // available. static std::string getLSCIntrinsicStr(llvm::StringRef opName, int simd_lanes, - xegpu::MemoryScope memoryScope, + xegpu::MemorySpace MemorySpace, llvm::StringRef dataTyStr = "") { - auto kind = memoryScope == xegpu::MemoryScope::SLM ? "slm" : "stateless"; + auto kind = MemorySpace == xegpu::MemorySpace::SLM ? "slm" : "stateless"; // using 32bit for slm and 64bit for stateless - auto addrBits = memoryScope == xegpu::MemoryScope::SLM ? 32 : 64; + auto addrBits = MemorySpace == xegpu::MemorySpace::SLM ? 
32 : 64; auto predTyStr = llvm::formatv("v{0}i1", simd_lanes).str(); auto offsetTyStr = llvm::formatv("v{0}i{1}", simd_lanes, addrBits).str(); if (opName == "load") { @@ -443,7 +443,7 @@ static Value genLoadIntrinsicCallWithC32BConversion( ConversionPatternRewriter &rewriter, Location &loc, VectorType resultTy, int simd_lanes, Value pred, std::optional l1, std::optional l3, Type elemTy, int chunkSize, - xegpu::MemoryScope scope, Value addresses) { + xegpu::MemorySpace scope, Value addresses) { // truncate the value from i32Ty to elemTy. auto truncate = [&](Value value, Type elemTy, @@ -486,7 +486,7 @@ static Value gen1DLoadInstrinsicCall(ConversionPatternRewriter &rewriter, std::optional l1, std::optional l3, Type elemTy, int elems, - xegpu::MemoryScope scope, Value payload) { + xegpu::MemorySpace scope, Value payload) { const int simd_lanes = 1; auto pred = dense_vector_int_val(1, i1Ty, simd_lanes); auto bitWidth = elemTy.getIntOrFloatBitWidth(); @@ -512,9 +512,9 @@ static func::CallOp genPrefetchIntrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, int simd_lanes, std::optional l1, std::optional l3, Type elemTy, - int chunkSize, xegpu::MemoryScope memoryScope, + int chunkSize, xegpu::MemorySpace MemorySpace, Value addresses) { - auto intrinsicStr = getLSCIntrinsicStr("prefetch", simd_lanes, memoryScope); + auto intrinsicStr = getLSCIntrinsicStr("prefetch", simd_lanes, MemorySpace); auto pred = dense_vector_int_val(1, i1Ty, simd_lanes); return genLSCIntrinsicCallWithEncoding( rewriter, loc, intrinsicStr, {} /* null resultType */, pred, LSC_LOAD, l1, @@ -526,12 +526,12 @@ genPrefetchIntrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, static func::CallOp gen1DPrefetchIntrinsicCall( ConversionPatternRewriter &rewriter, Location &loc, std::optional l1, std::optional l3, - Type elemTy, int elems, xegpu::MemoryScope memoryScope, Value payload) { + Type elemTy, int elems, xegpu::MemorySpace MemorySpace, Value payload) { const int simd_lanes = 1; auto bitWidth = elemTy.getIntOrFloatBitWidth(); assert(bitWidth >= 32 && "1D block is only for 32/64-bit data."); return genPrefetchIntrinsicCall(rewriter, loc, simd_lanes, l1, l3, elemTy, - elems, memoryScope, payload); + elems, MemorySpace, payload); } // Generate a call to lsc.store intrinsic, using convert-to-32b conversion @@ -545,7 +545,7 @@ static func::CallOp genStoreIntrinsicCallWithC32BConversion( ConversionPatternRewriter &rewriter, Location &loc, int simd_lanes, Value pred, std::optional l1, std::optional l3, Type elemTy, int chunkSize, - xegpu::MemoryScope scope, Value addresses, Value data) { + xegpu::MemorySpace scope, Value addresses, Value data) { // lsc store only takes 32-bit data as input and save the least 8-bit, // or 16-bit to the memory. 
So we need to extend the data to 32-bit if @@ -597,7 +597,7 @@ static func::CallOp gen1DStoreInstrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, std::optional l1, std::optional l3, Type elemTy, - int elems, xegpu::MemoryScope scope, Value payload, + int elems, xegpu::MemorySpace scope, Value payload, Value data) { auto bitWidth = elemTy.getIntOrFloatBitWidth(); assert(bitWidth >= 32 && "1D block is only for 32/64-bit data."); @@ -729,7 +729,7 @@ class LoadNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); @@ -763,11 +763,11 @@ class LoadNdPattern : public OpConversionPattern { auto newValue = gen1DLoadInstrinsicCall( rewriter, loc, op.getType(), l1hint, l3hint, elemTy, elems, - tdescTy.getMemoryScope(), adaptor.getTensorDesc()); + tdescTy.getMemorySpace(), adaptor.getTensorDesc()); rewriter.replaceOp(op, newValue); return success(); } else if (rank == 2) { // 2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block load."); auto payload = adaptor.getTensorDesc(); @@ -800,8 +800,9 @@ class LoadNdPattern : public OpConversionPattern { // keep the clean interface. This part of the logic will be moved out. auto shape = tdescTy.getShape().vec(); shape[1] = shape[1] / factor; - tdescTy = TensorDescType::get(tdescTy.getContext(), shape, elemTy, - tdescTy.getEncoding()); + tdescTy = + TensorDescType::get(tdescTy.getContext(), shape, elemTy, + tdescTy.getEncoding(), /*sg_map*/ nullptr); // update arg7 of the payload auto nblks = tdescTy.getArrayLength(); @@ -862,14 +863,14 @@ class PrefetchNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); if (rank == 1) { // for 1D tensor desc, use lsc.load - if (scope == xegpu::MemoryScope::SLM) { + if (scope == xegpu::MemorySpace::SLM) { // no prefetch for slm. 
rewriter.eraseOp(op); return success(); @@ -887,7 +888,7 @@ class PrefetchNdPattern : public OpConversionPattern { rewriter.replaceOp(op, callOp); return success(); } else if (rank == 2) { // 2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block prefetch."); auto callOp = gen2DPrefetchIntrinsicCall( @@ -910,7 +911,7 @@ class StoreNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); @@ -943,7 +944,7 @@ class StoreNdPattern : public OpConversionPattern { return success(); } else if (rank == 2) { // store.2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block store."); @@ -996,7 +997,7 @@ class LoadGatherPattern : public OpConversionPattern { auto resultTy = cast(op.getType()); auto newValue = genLoadIntrinsicCallWithC32BConversion( rewriter, loc, resultTy, simd_lanes, op.getMask(), l1hint, l3hint, - elemTy, chunkSize, tdescTy.getMemoryScope(), adaptor.getTensorDesc()); + elemTy, chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc()); rewriter.replaceOp(op, newValue); return success(); @@ -1015,11 +1016,11 @@ class PrefetchPattern : public OpConversionPattern { auto elemTy = tdescTy.getElementType(); auto chunkSize = tdescTy.getChunkSize(); auto simd_lanes = tdescTy.getShape()[0]; - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); // For SLM, there is not prefetch available, we will simply // remove the prefetch op. - if (scope == xegpu::MemoryScope::SLM) { + if (scope == xegpu::MemorySpace::SLM) { rewriter.eraseOp(op); return success(); } @@ -1080,7 +1081,7 @@ class StoreScatterPattern : public OpConversionPattern { auto l3hint = op.getL3Hint(); auto callOp = genStoreIntrinsicCallWithC32BConversion( rewriter, loc, simd_lanes, op.getMask(), l1hint, l3hint, elemTy, - chunkSize, tdescTy.getMemoryScope(), adaptor.getTensorDesc(), + chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc(), adaptor.getValue()); rewriter.replaceOp(op, callOp); @@ -1204,10 +1205,10 @@ class FencePattern : public OpConversionPattern { fence_scope = lscFenceScope::GROUP; switch (op.getMemoryKind()) { - case xegpu::MemoryScope::Global: + case xegpu::MemorySpace::Global: sfid = lscSFID::UGM; break; - case xegpu::MemoryScope::SLM: + case xegpu::MemorySpace::SLM: sfid = lscSFID::TGM; break; } diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp index 3d6858464..b97010633 100644 --- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp +++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp @@ -227,13 +227,13 @@ class CreateNdDescPattern : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto tdescTy = op.getType(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto rank = tdescTy.getRank(); auto elemBytes = tdescTy.getElementType().getIntOrFloatBitWidth() / 8; // SLM has to use 32-bit address, while ugm needs to use 64-bit address. auto addrTy = - (scope == xegpu::MemoryScope::SLM) ? (Type)i32Ty : (Type)i64Ty; + (scope == xegpu::MemorySpace::SLM) ? 
(Type)i32Ty : (Type)i64Ty; // Handle different source types: memref and i64/i32/ui64/ui32 auto memRefType = dyn_cast(op.getSource().getType()); @@ -249,7 +249,7 @@ class CreateNdDescPattern : public OpConversionPattern { base = adjustBasePointer(rewriter, op, base); base = rewriter.create(loc, addrTy, base); - if (scope == xegpu::MemoryScope::SLM || rank == 1) { + if (scope == xegpu::MemorySpace::SLM || rank == 1) { // for SLM and 1D, we need to create message for use regular load/store // instead of matrix descriptor, the shape of accepted TensorDescs are // limited to 1xN (rank = 2 with leading dimension to be 1) or N (rank = @@ -369,14 +369,14 @@ class UpdateNDOffsetPattern : public OpConversionPattern { auto loc = op.getLoc(); auto tdescTy = op.getTensorDescType(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto rank = tdescTy.getRank(); auto addrTy = - (scope == xegpu::MemoryScope::SLM) ? (Type)i32Ty : (Type)i64Ty; + (scope == xegpu::MemorySpace::SLM) ? (Type)i32Ty : (Type)i64Ty; auto desc = adaptor.getTensorDesc(); - if (scope == xegpu::MemoryScope::SLM || rank == 1) { + if (scope == xegpu::MemorySpace::SLM || rank == 1) { // for SLM and 1D, we need to create message for use regular load/store // instead of matrix descriptor @@ -435,20 +435,6 @@ class UpdateNDOffsetPattern : public OpConversionPattern { } }; -// converts an array of OpFoldResult into a vector of index. -static Value convertToIndexVector(llvm::ArrayRef ofrs, - Location loc, - ConversionPatternRewriter &rewriter) { - SmallVector array; - for (auto ofr : ofrs) { - auto value = getValueOrConstantOp(ofr, loc, rewriter, indexTy); - assert(value.getType().isIndex() && "expecting an index type value."); - array.push_back(value); - } - return rewriter.create( - loc, vecTy(ofrs.size(), indexTy), ValueRange(array)); -} - class CreateDescPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -462,8 +448,8 @@ class CreateDescPattern : public OpConversionPattern { assert(elemTy.isIntOrFloat() && "only support int or float element type."); // use 32-bit address for SLM and 64-bit address for UGM - auto scope = tdescTy.getMemoryScope(); - auto addrTy = scope == xegpu::MemoryScope::SLM ? (Type)i32Ty : (Type)i64Ty; + auto scope = tdescTy.getMemorySpace(); + auto addrTy = scope == xegpu::MemorySpace::SLM ? (Type)i32Ty : (Type)i64Ty; Value base = rewriter.create( loc, adaptor.getSource()); @@ -478,8 +464,7 @@ class CreateDescPattern : public OpConversionPattern { // offset is represented in number of elements, need to scale it to bytes auto elemBytes = elemTy.getIntOrFloatBitWidth() / 8; auto factor = dense_vector_int_val(elemBytes, addrTy, simd_lanes); - Value offsets = convertToIndexVector(op.getMixedOffsets(), loc, rewriter); - offsets = castValueTo(offsets, payloadTy, loc, rewriter); + Value offsets = castValueTo(adaptor.getOffsets(), payloadTy, loc, rewriter); offsets = muli(factor, offsets); // create a payload with the base address broadcasted to all simd lanes @@ -506,16 +491,15 @@ class UpdateOffsetOpPattern : public OpConversionPattern { assert(elemTy.isIntOrFloat() && "only support int or float element type."); // use 32-bit address for SLM and 64-bit address for UGM - auto scope = tdescTy.getMemoryScope(); - auto addrTy = scope == xegpu::MemoryScope::SLM ? (Type)i32Ty : (Type)i64Ty; + auto scope = tdescTy.getMemorySpace(); + auto addrTy = scope == xegpu::MemorySpace::SLM ? 
(Type)i32Ty : (Type)i64Ty; auto simd_lanes = tdescTy.getShape()[0]; auto payloadTy = VectorType::get(simd_lanes, addrTy); auto elemBytes = elemTy.getIntOrFloatBitWidth() / 8; Value factor = dense_vector_int_val(elemBytes, addrTy, simd_lanes); - Value offsets = convertToIndexVector(op.getMixedOffsets(), loc, rewriter); - offsets = castValueTo(offsets, payloadTy, loc, rewriter); + Value offsets = castValueTo(adaptor.getOffsets(), payloadTy, loc, rewriter); offsets = muli(factor, offsets); auto payload = addi(adaptor.getTensorDesc(), offsets); @@ -917,14 +901,14 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase { typeConverter.addConversion([&](IndexType type) { return type; }); typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type { - auto scope = type.getMemoryScope(); + auto scope = type.getMemorySpace(); auto rank = type.getRank(); auto i32Type = IntegerType::get(&getContext(), 32); auto i64Type = IntegerType::get(&getContext(), 64); - if (type.isScattered() || rank == 1 || scope == xegpu::MemoryScope::SLM) { + if (type.isScattered() || rank == 1 || scope == xegpu::MemorySpace::SLM) { auto addrTy = - scope == xegpu::MemoryScope::SLM ? (Type)i32Type : (Type)i64Type; + scope == xegpu::MemorySpace::SLM ? (Type)i32Type : (Type)i64Type; auto simd_lanes = type.isScattered() ? type.getShape()[0] : 1; return VectorType::get(simd_lanes, addrTy); } else if (rank == 2) { diff --git a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp index 31cffb595..1102a8b64 100644 --- a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp @@ -311,8 +311,7 @@ class SgVectorMultiDimReductionOpPattern rewriter.setInsertionPoint(op); // doing reduction on outer dimension - if (mlir::isConstantIntValue(dims[0], 0) && - mlir::isConstantIntValue(dims[1], 2)) { + if (dims[0] == 0 && dims[1] == 2) { auto intermediates = lowerOuterReduction(sources, shape, op.getKind(), loc, elemTy, rewriter); { @@ -330,8 +329,7 @@ class SgVectorMultiDimReductionOpPattern } // doing reduction on inner dimension - if (mlir::isConstantIntValue(dims[0], 1) && - mlir::isConstantIntValue(dims[1], 3)) { + if (dims[0] == 1 && dims[1] == 3) { auto intermediates = lowerInnerReductionWithIntraVectorShuffles( sources, shape, op.getKind(), loc, elemTy, rewriter); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index bd9201d10..60033b870 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -407,9 +407,9 @@ class SgInitTileOpPattern : public XeOneToNConversion { auto shape = llvm::to_vector(tileTy.getShape()); auto indexType = rewriter.getIndexType(); - auto memoryScope = op.getSourceMemorySpaceAsInt() == 3 - ? mlir::xegpu::MemoryScope::SLM - : mlir::xegpu::MemoryScope::Global; + auto MemorySpace = op.getSourceMemorySpaceAsInt() == 3 + ? 
mlir::xegpu::MemorySpace::SLM + : mlir::xegpu::MemorySpace::Global; if (tileTy.getRank() != 2) return op.emitOpError("The tile shape should be 2D."); @@ -454,7 +454,7 @@ class SgInitTileOpPattern : public XeOneToNConversion { auto offsetsX = offsets.pop_back_val(); auto tDescTy = mlir::xegpu::TensorDescType::get( - innerBlk, elemTy, array_length, true /*boundary_check*/, memoryScope); + innerBlk, elemTy, array_length, true /*boundary_check*/, MemorySpace); auto createIndexConstant = [&](mlir::Type type, int64_t value) { auto attr = rewriter.getIndexAttr(value); diff --git a/lib/Dialect/XeTile/IR/XeTileDialect.cpp b/lib/Dialect/XeTile/IR/XeTileDialect.cpp index a2afb71e3..0813ea315 100644 --- a/lib/Dialect/XeTile/IR/XeTileDialect.cpp +++ b/lib/Dialect/XeTile/IR/XeTileDialect.cpp @@ -116,7 +116,7 @@ mlir::LogicalResult XeTileAttr::verify( ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::imex::xetile::SubGroupMapAttr sg_map, xetile::WorkGroupMapAttr wg_map, mlir::DenseI32ArrayAttr order, mlir::DenseI64ArrayAttr inner_blocks, - mlir::Attribute memoryScope) { + mlir::Attribute MemorySpace) { if (order != mlir::DenseI32ArrayAttr() && order.size() != 2) emitError() << "expect integer array of size 2 for order"; diff --git a/lib/Dialect/XeTile/IR/XeTileOps.cpp b/lib/Dialect/XeTile/IR/XeTileOps.cpp index f51249f49..c9767a9dc 100644 --- a/lib/Dialect/XeTile/IR/XeTileOps.cpp +++ b/lib/Dialect/XeTile/IR/XeTileOps.cpp @@ -129,7 +129,7 @@ mlir::LogicalResult InitTileOp::verify() { auto tileTy = getType(); // Check for memory space validity. if (getSourceMemorySpaceAsInt() != - static_cast(tileTy.getMemoryScopeAsInt())) + static_cast(tileTy.getMemorySpaceAsInt())) return emitOpError( "memory space of the tile doesn't match with the source."); diff --git a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp index 3b1c902a4..b579e7772 100644 --- a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp @@ -203,7 +203,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), newBlockSize, tileTy.getMemoryScope()); + tileTy.getOrder(), newBlockSize, tileTy.getMemorySpace()); auto newTileTy = imex::xetile::TileType::get(tileTy.getShape(), tileTy.getElementType(), attr); diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index 83f82c4a3..02f8584e8 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -548,8 +548,8 @@ struct VectorMultiDimReductionOpPattern // will be transformed to // multi_reduction, %e, %a[1, 3]: vector<16x2x1x16xf16> to // vector<16x1xf16> - auto dim = mlir::cast(reductionDims[0]).getInt(); - auto newReductionDims = rewriter.getI64ArrayAttr({dim, dim + 2}); + auto dim = reductionDims[0]; + auto newReductionDims = rewriter.getDenseI64ArrayAttr({dim, dim + 2}); auto newDestShape = (dim == 0) @@ -955,7 +955,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), innerBlocks, tileTy.getMemoryScope()); + tileTy.getOrder(), innerBlocks, tileTy.getMemorySpace()); auto newTileTy = imex::xetile::TileType::get(tileTy.getShape(), elemTy, attr); diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp index 9f575df8e..2005f185c 100644 --- 
a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp @@ -213,9 +213,10 @@ class BlockingAnalysisImpl std::shared_ptr uArch) : SparseBackwardDataFlowAnalysis(solver, symbolTable), uArch(uArch) {} - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override; + mlir::LogicalResult + visitOperation(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results) override; void visitBranchOperand(mlir::OpOperand &operand) override {} @@ -283,7 +284,7 @@ class BlockingAnalysisImpl std::shared_ptr uArch = nullptr; }; -void BlockingAnalysisImpl::visitOperation( +mlir::LogicalResult BlockingAnalysisImpl::visitOperation( mlir::Operation *op, mlir::ArrayRef operands, mlir::ArrayRef results) { @@ -319,6 +320,8 @@ void BlockingAnalysisImpl::visitOperation( if (auto createMaskOp = mlir::dyn_cast(op)) visitCreateMaskOp(createMaskOp, operands, results); + + return mlir::success(); } void BlockingAnalysisImpl::visitPrefetchTileOp( @@ -327,7 +330,7 @@ void BlockingAnalysisImpl::visitPrefetchTileOp( auto tileTy = op.getTile().getType(); auto elemTy = tileTy.getElementType(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); // initialized with a default size queried from the architecture auto size = getInnerBlockSize(op, elemTy, shape, memSpace); if (!size) @@ -348,7 +351,7 @@ void BlockingAnalysisImpl::visitLoadTileOp( auto elemTy = tileTy.getElementType(); auto bitWidth = elemTy.getIntOrFloatBitWidth(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); // initialized with a default size queried from the architecture Block block = getInnerBlockSize(op, elemTy, shape, memSpace); @@ -387,7 +390,7 @@ void BlockingAnalysisImpl::visitStoreTileOp( auto tileTy = op.getTile().getType(); auto elemTy = tileTy.getElementType(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); auto size = getInnerBlockSize(op, elemTy, shape, memSpace); if (!size) diff --git a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp index a988583d3..878987bdc 100644 --- a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp @@ -213,7 +213,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), innerBlockAttr, tileTy.getMemoryScope()); + tileTy.getOrder(), innerBlockAttr, tileTy.getMemorySpace()); auto elemTy = tileTy.getElementType(); auto newTileTy = imex::xetile::TileType::get(shape, elemTy, attr); diff --git a/lib/Dialect/XeTile/Transforms/Canonicalization.cpp b/lib/Dialect/XeTile/Transforms/Canonicalization.cpp index 53c7769f4..8cc02340a 100644 --- a/lib/Dialect/XeTile/Transforms/Canonicalization.cpp +++ b/lib/Dialect/XeTile/Transforms/Canonicalization.cpp @@ -301,16 +301,14 @@ struct VectorMultiReductionToXeTileReduce return mlir::failure(); // If result is not 1D, we can not convert it to xetile.reduce. This // requires that the reduction dimensions has rank 1. - auto reductionDims = op.getReductionDims().getValue(); + auto reductionDims = op.getReductionDims(); if (reductionDims.size() != 1) return mlir::failure(); // Retain discardable attributes if any. 
llvm::SmallVector discardableAttrs( op->getDiscardableAttrs().begin(), op->getDiscardableAttrs().end()); // Create an equivalent XeTileReduceOp - int64_t reduceDim = llvm::cast(reductionDims[0]) - .getValue() - .getSExtValue(); + int64_t reduceDim = reductionDims[0]; auto resultTy = llvm::cast(op.getType()); auto xetileResultTy = mlir::VectorType::get( (reduceDim == 0 ? llvm::ArrayRef({1, resultTy.getDimSize(0)}) @@ -410,7 +408,7 @@ struct XeTileCanonicalizationPass final auto newAttr = imex::xetile::XeTileAttr::get( tileTy.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), mlir::DenseI32ArrayAttr::get(tileTy.getContext(), {1, 0}), - tileTy.getInnerBlocks(), tileTy.getMemoryScope()); + tileTy.getInnerBlocks(), tileTy.getMemorySpace()); return imex::xetile::TileType::get( swapLastTwoElems(tileTy.getShape()), tileTy.getElementType(), diff --git a/lib/Transforms/OptimizeTranspose.cpp b/lib/Transforms/OptimizeTranspose.cpp index 9033786d8..7a150f48e 100644 --- a/lib/Transforms/OptimizeTranspose.cpp +++ b/lib/Transforms/OptimizeTranspose.cpp @@ -128,8 +128,8 @@ struct LoadTransposeAnalysis { transposeAttr.asArrayRef() == llvm::ArrayRef{1, 0}) return mlir::WalkResult::skip(); // Memory space of the load op must be global. - if (loadOp.getTensorDesc().getType().getMemoryScope() != - mlir::xegpu::MemoryScope::Global) + if (loadOp.getTensorDesc().getType().getMemorySpace() != + mlir::xegpu::MemorySpace::Global) return mlir::WalkResult::skip(); // Single user must be a transpose op. auto transposeOp = llvm::dyn_cast_if_present( diff --git a/lib/Transforms/PropagatePackedLayout.cpp b/lib/Transforms/PropagatePackedLayout.cpp index c872ba865..2f220be88 100644 --- a/lib/Transforms/PropagatePackedLayout.cpp +++ b/lib/Transforms/PropagatePackedLayout.cpp @@ -160,9 +160,9 @@ class LayoutAnalysisImpl public: using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override { + mlir::LogicalResult + visitOperation(mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) override { if (mlir::OpTrait::hasElementwiseMappableTraits(op)) { Layout layout; for (auto &&[res, resLattice] : @@ -182,7 +182,7 @@ class LayoutAnalysisImpl propagateIfChanged(argLattice, argLattice->meet(tmpLayout)); } - return; + return mlir::success(); } if (auto dpas = mlir::dyn_cast(op)) { @@ -193,12 +193,14 @@ class LayoutAnalysisImpl propagateIfChanged(operand, operand->meet(std::nullopt)); } } - return; + return mlir::success(); } // Unknown ops: mark all args as invalid layout (no layout change). 
for (auto operand : operands) propagateIfChanged(operand, operand->meet(std::nullopt)); + + return mlir::success(); } void visitBranchOperand(mlir::OpOperand &operand) override {} diff --git a/lib/Transforms/VnniTransformation.cpp b/lib/Transforms/VnniTransformation.cpp index 36f2a7f8f..bb5ee5ae2 100644 --- a/lib/Transforms/VnniTransformation.cpp +++ b/lib/Transforms/VnniTransformation.cpp @@ -127,9 +127,9 @@ class LayoutAnalysisImpl public: using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override { + mlir::LogicalResult + visitOperation(mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) override { // the B operand of a dpas operation is always in vnni layout // and it is the start point of the layout propagation if (auto dpas = mlir::dyn_cast(op)) { @@ -144,7 +144,7 @@ class LayoutAnalysisImpl // for C operand, it cannot be in vnni format propagateIfChanged(operands[2], operands[2]->meet(Layout(false))); } - return; + return mlir::success(); } if (mlir::OpTrait::hasElementwiseMappableTraits(op)) { @@ -175,7 +175,7 @@ class LayoutAnalysisImpl for (auto &&lattice : operands) propagateIfChanged(lattice, lattice->meet(layout)); } - return; + return mlir::success(); } if (auto extractStrideSliceOp = @@ -186,7 +186,7 @@ class LayoutAnalysisImpl layout = Layout::meet(layout, Layout(isVNNIApplicable(srcTy))); propagateIfChanged(operands[0], operands[0]->meet(layout)); } - return; + return mlir::success(); } if (auto extractOp = mlir::dyn_cast(op)) { @@ -201,12 +201,14 @@ class LayoutAnalysisImpl layout = Layout::meet(layout, Layout(isVNNIApplicable(vecTy))); propagateIfChanged(operands[0], operands[0]->meet(layout)); } - return; + return mlir::success(); } // Unknown ops: mark all args as non-vnni layout (no layout change). 
for (auto operand : operands) propagateIfChanged(operand, operand->join(Layout(false))); + + return mlir::success(); } void visitBranchOperand(mlir::OpOperand &operand) override {} diff --git a/lib/Utils/XeArch.cpp b/lib/Utils/XeArch.cpp index c9f3657cf..649faaa3a 100644 --- a/lib/Utils/XeArch.cpp +++ b/lib/Utils/XeArch.cpp @@ -304,7 +304,7 @@ mlir::LogicalResult XeuArchInterface::isLegalLoad2dOp(mlir::Operation *op) { auto tdescTy = loadOp.getTensorDescType(); // TODO: need more thinking on SLM - if (tdescTy.getMemoryScope() == mlir::xegpu::MemoryScope::SLM) + if (tdescTy.getMemorySpace() == mlir::xegpu::MemorySpace::SLM) return mlir::success(); int elementSize = loadOp.getTensorDescType().getElementTypeBitWidth(); @@ -347,7 +347,7 @@ mlir::LogicalResult XeuArchInterface::isLegalStore2dOp(mlir::Operation *op) { int elementSize = tdescTy.getElementTypeBitWidth(); // TODO: need more thinking on SLM - if (tdescTy.getMemoryScope() == mlir::xegpu::MemoryScope::SLM) + if (tdescTy.getMemorySpace() == mlir::xegpu::MemorySpace::SLM) return mlir::success(); LoadStore2DConfig storeParams; diff --git a/test/Conversion/GPUToSPIRV/printf.mlir b/test/Conversion/GPUToSPIRV/printf.mlir index 5c0255cbe..206b156da 100644 --- a/test/Conversion/GPUToSPIRV/printf.mlir +++ b/test/Conversion/GPUToSPIRV/printf.mlir @@ -47,7 +47,7 @@ module @test attributes { %2 = gpu.thread_id x // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr, UniformConstant> // CHECK-NEXT: [[FMTSTR_PTR:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr, UniformConstant> to !spirv.ptr - // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR]] : !spirv.ptr({{%.*}}, {{%.*}}, {{%.*}} : i32, f32, i64) -> i32 + // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr, i32, f32, i64 -> i32 gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index gpu.return } diff --git a/test/Conversion/XeGPUToVC/atomiclsc.mlir b/test/Conversion/XeGPUToVC/atomiclsc.mlir index 8402e53b1..353e94860 100644 --- a/test/Conversion/XeGPUToVC/atomiclsc.mlir +++ b/test/Conversion/XeGPUToVC/atomiclsc.mlir @@ -4,46 +4,30 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel { // CHECK: func.func private @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64(vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> attributes {VectorComputeFunctionINTEL, linkage_attributes = #spirv.linkage_attributes>} gpu.func @test_atomiclsc(%arg0: memref<128xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> + %mask = arith.constant dense : vector<16xi1> + + // CHECK: %[[cst_0:.*]] = arith.constant dense<5.000000e-01> : vector<16xf32> + %1 = arith.constant dense<0.5> : vector<16xf32> - //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> - //CHECK: %[[cst_0:.*]] = arith.constant dense<5.000000e-01> : vector<16xf32> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %{{.*}} : memref<128xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_1:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[c2:.*]] = arith.constant 2 : index - //CHECK: %[[c3:.*]] = arith.constant 3 : index - //CHECK: %[[c4:.*]] = 
arith.constant 4 : index - //CHECK: %[[c5:.*]] = arith.constant 5 : index - //CHECK: %[[c6:.*]] = arith.constant 6 : index - //CHECK: %[[c7:.*]] = arith.constant 7 : index - //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[c9:.*]] = arith.constant 9 : index - //CHECK: %[[c10:.*]] = arith.constant 10 : index - //CHECK: %[[c11:.*]] = arith.constant 11 : index - //CHECK: %[[c12:.*]] = arith.constant 12 : index - //CHECK: %[[c13:.*]] = arith.constant 13 : index - //CHECK: %[[c14:.*]] = arith.constant 14 : index - //CHECK: %[[c15:.*]] = arith.constant 15 : index - //CHECK: %[[r1:.*]] = vector.from_elements %[[c0]], %[[c1]], %[[c2]], %[[c3]], %[[c4]], %[[c5]], %[[c6]], %[[c7]], %[[c8]], %[[c9]], %[[c10]], %[[c11]], %[[c12]], %[[c13]], %[[c14]], %[[c15]] : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi64> //CHECK: %[[c19_i8:.*]] = arith.constant 19 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[cst_2:.*]] = arith.constant dense<0> : vector<16xi32> - //CHECK: %[[r6:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32> - //CHECK: %[[r7:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64(%[[cst]], %[[c19_i8]], %[[c1_i8]], %[[c1_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[cst]], %[[r5]], %[[r6]], %[[cst_2]], %[[c0_i32]], %[[cst_2]]) : (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> - - %mask = arith.constant dense : vector<16xi1> - %1 = arith.constant dense<0.5> : vector<16xf32> + //CHECK: %[[r3:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32> + //CHECK: %[[r4:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64( + //CHECK-SAME: %[[cst]], %[[c19_i8]], %[[c1_i8]], %[[c1_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], + //CHECK-SAME: %[[c1_i8]], %[[c1_i8]], %[[cst]], %[[r2]], %[[r3]], %[[cst_2]], %[[c0_i32]], %[[cst_2]]) + //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> - %2 = xegpu.create_tdesc %arg0[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<128xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> gpu.return } diff --git a/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir index c1f3de856..10a9a7f77 100644 --- 
a/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf16>, %[[arg1:.*]]: memref<16xf16>) kernel @@ -9,39 +9,38 @@ gpu.module @test_kernel { //CHECK: %[[mask:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + //CHECK: %[[a_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[a_ptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<2> : vector<16xi64> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c6_i8:.*]] = arith.constant 6 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 - //CHECK: %[[r6:.*]] = func.call @llvm.genx.lsc.load.stateless.v16i32.v16i1.v16i64 - //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.stateless.v16i32.v16i1.v16i64 + //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> vector<16xi32> - //CHECK: %[[r7:.*]] = arith.trunci %[[r6]] : vector<16xi32> to vector<16xi16> - //CHECK: %[[r8:.*]] = vector.bitcast %[[r7]] : vector<16xi16> to vector<16xf16> + //CHECK: %[[r4:.*]] = arith.trunci %[[r3]] : vector<16xi32> to vector<16xi16> + //CHECK: %[[r5:.*]] = vector.bitcast %[[r4]] : vector<16xi16> to vector<16xf16> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> //CHECK: %[[b_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg1]] : memref<16xf16> -> index - //CHECK: %[[r9:.*]] = arith.index_castui %[[b_ptr]] : index to i64 - //CHECK: %[[r10:.*]] = vector.broadcast %[[r9]] : i64 to vector<16xi64> - //CHECK: %[[r11:.*]] = arith.addi %[[r10]], %[[r3]] : vector<16xi64> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[r6:.*]] = 
arith.index_castui %[[b_ptr]] : index to i64 + //CHECK: %[[r7:.*]] = vector.broadcast %[[r6]] : i64 to vector<16xi64> + //CHECK: %[[r8:.*]] = arith.addi %[[r7]], %[[cst_0]] : vector<16xi64> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> - //CHECK: %[[r12:.*]] = vector.bitcast %[[r8]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r13:.*]] = arith.extui %[[r12]] : vector<16xi16> to vector<16xi32> + //CHECK: %[[r9:.*]] = vector.bitcast %[[r5]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r10:.*]] = arith.extui %[[r9]] : vector<16xi16> to vector<16xi32> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16i32 - //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r11]], %[[r13]], %[[c0_i32]]) + //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r8]], %[[r10]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xi32>, i32) -> () xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return diff --git a/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir index b99c8e289..a7a52f3d4 100644 --- a/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf32>, %[[arg1:.*]]: memref<16xf32>) kernel @@ -9,37 +9,34 @@ gpu.module @test_kernel { //CHECK: %[[mask:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[a_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[a_ptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<4> : vector<16xi64> - - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 - //CHECK: %[[r6:.*]] 
= func.call @llvm.genx.lsc.load.stateless.v16f32.v16i1.v16i64 - //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.stateless.v16f32.v16i1.v16i64 + //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> vector<16xf32> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> //CHECK: %[[b_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg1]] : memref<16xf32> -> index - //CHECK: %[[r7:.*]] = arith.index_castui %[[b_ptr]] : index to i64 - //CHECK: %[[r8:.*]] = vector.broadcast %[[r7]] : i64 to vector<16xi64> - //CHECK: %[[r9:.*]] = arith.addi %[[r8]], %[[r3]] : vector<16xi64> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[r4:.*]] = arith.index_castui %[[b_ptr]] : index to i64 + //CHECK: %[[r5:.*]] = vector.broadcast %[[r4]] : i64 to vector<16xi64> + //CHECK: %[[r6:.*]] = arith.addi %[[r5]], %[[cst_0]] : vector<16xi64> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16f32 - //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r9]], %[[r6]], %[[c0_i32]]) + //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r6]], %[[r3]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xf32>, i32) -> () xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir index a42062a72..4ad2f4880 100644 --- a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf16>, %[[arg1:.*]]: memref<16xf16>) kernel @@ -8,17 +8,14 @@ gpu.module @test_kernel { //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<2> : vector<16xi64> - - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: 
%[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -27,12 +24,12 @@ gpu.module @test_kernel { //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64 - //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> () xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf16, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir index 64d310ae9..f748b25ca 100644 --- a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir @@ -1,21 +1,19 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { gpu.func @test_copy(%a: memref<16xf32>, %b: memref<16xf32>) kernel { //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<16xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: 
%[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -24,11 +22,11 @@ gpu.module @test_kernel { //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64 - //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> () xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf32, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir index fda563970..696a44e85 100644 --- a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_store_scatter(%[[arg0:.*]]: memref<16xf16>) kernel @@ -14,21 +14,20 @@ gpu.module @test_kernel { //CHECK: %[[cst_0:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + //CHECK: %[[alloc:.*]] = memref.alloc() : memref<16xf16, 3> %slm = memref.alloc() : memref<16xf16, 3> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[alloc]] : memref<16xf16, 3> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[cst_1:.*]] = arith.constant dense<2> : vector<16xi32> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi32> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi32> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi32> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16, 3> -> !xegpu.tensor_desc<16xf16, #slm> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi32> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi32> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf16, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #slm> - 
//CHECK: %[[r6:.*]] = vector.bitcast %[[cst]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r7:.*]] = arith.extui %[[r6]] : vector<16xi16> to vector<16xi32> + //CHECK: %[[r3:.*]] = vector.bitcast %[[cst]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r4:.*]] = arith.extui %[[r3]] : vector<16xi16> to vector<16xi32> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -36,32 +35,29 @@ gpu.module @test_kernel { //CHECK: %[[c6_i8:.*]] = arith.constant 6 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.store.slm.v16i1.v16i32.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[r7]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[r4]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, vector<16xi32>, i32) -> () xegpu.store %cst, %slm_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> - //CHECK: %[[r8:.*]] = func.call @llvm.genx.lsc.load.slm.v16i32.v16i1.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r5:.*]] = func.call @llvm.genx.lsc.load.slm.v16i32.v16i1.v16i32 + //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, i32) -> vector<16xi32> - //CHECK: %[[r9:.*]] = arith.trunci %[[r8]] : vector<16xi32> to vector<16xi16> - //CHECK: %[[r10:.*]] = vector.bitcast %[[r9]] : vector<16xi16> to vector<16xf16> + //CHECK: %[[r6:.*]] = arith.trunci %[[r5]] : vector<16xi32> to vector<16xi16> + //CHECK: %[[r7:.*]] = vector.bitcast %[[r6]] : vector<16xi16> to vector<16xf16> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> -> vector<16xf16> //CHECK: %[[intptr_2:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index - //CHECK: %[[r11:.*]] = arith.index_castui %[[intptr_2]] : index to i64 - //CHECK: %[[cst_3:.*]] = arith.constant dense<2> : vector<16xi64> - - //CHECK: %[[r12:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r13:.*]] = arith.muli %[[r12]], %[[cst_3]] : vector<16xi64> - //CHECK: %[[r14:.*]] = vector.broadcast %[[r11]] : i64 to vector<16xi64> - //CHECK: %[[r15:.*]] = arith.addi %[[r14]], %[[r13]] : vector<16xi64> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #global> + //CHECK: %[[r8:.*]] = arith.index_castui %[[intptr_2]] : index to i64 + //CHECK: %[[cst_3:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r9:.*]] = vector.broadcast %[[r8]] : i64 to vector<16xi64> + //CHECK: %[[r10:.*]] = arith.addi %[[r9]], %[[cst_3]] : vector<16xi64> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #global> - //CHECK: %[[r16:.*]] = vector.bitcast %[[r10]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r17:.*]] = arith.extui %[[r16]] : vector<16xi16> to 
vector<16xi32> + //CHECK: %[[r11:.*]] = vector.bitcast %[[r7]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r12:.*]] = arith.extui %[[r11]] : vector<16xi16> to vector<16xi32> //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r15]], %[[r17]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r10]], %[[r12]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xi32>, i32) -> () xegpu.store %data, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #global>, vector<16xi1> diff --git a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir index 1af7818e7..14024fcf9 100644 --- a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_store_scatter(%[[arg0:.*]]: memref<16xf32>) kernel @@ -12,19 +12,17 @@ gpu.module @test_kernel { //CHECK: %[[cst_0:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[alloc:.*]] = memref.alloc() : memref<16xf32, 3> %slm = memref.alloc() : memref<16xf32, 3> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[alloc]] : memref<16xf32, 3> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[cst_1:.*]] = arith.constant dense<4> : vector<16xi32> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi32> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi32> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi32> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32, 3> -> !xegpu.tensor_desc<16xf32, #slm> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi32> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi32> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #slm> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 @@ -33,26 +31,24 @@ gpu.module @test_kernel { //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.store.slm.v16i1.v16i32.v16f32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[cst]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], 
%[[c0_i8]], %[[r2]], %[[cst]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, vector<16xf32>, i32) -> () xegpu.store %cst, %slm_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> - //CHECK: %[[r6:.*]] = func.call @llvm.genx.lsc.load.slm.v16f32.v16i1.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.slm.v16f32.v16i1.v16i32 + //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, i32) -> vector<16xf32> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> -> vector<16xf32> //CHECK: %[[intptr_2:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf32> -> index - //CHECK: %[[r7:.*]] = arith.index_castui %[[intptr_2]] : index to i64 - //CHECK: %[[cst_3:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[r8:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r9:.*]] = arith.muli %[[r8]], %[[cst_3]] : vector<16xi64> - //CHECK: %[[r10:.*]] = vector.broadcast %[[r7]] : i64 to vector<16xi64> - //CHECK: %[[r11:.*]] = arith.addi %[[r10]], %[[r9]] : vector<16xi64> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #global> + //CHECK: %[[r4:.*]] = arith.index_castui %[[intptr_2]] : index to i64 + //CHECK: %[[cst_3:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r5:.*]] = vector.broadcast %[[r4]] : i64 to vector<16xi64> + //CHECK: %[[r6:.*]] = arith.addi %[[r5]], %[[cst_3]] : vector<16xi64> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #global> //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16f32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r11]], %[[r6]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r6]], %[[r3]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xf32>, i32) -> () xegpu.store %data, %tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #global>, vector<16xi1> diff --git a/test/Conversion/XeTileToXeGPU/array_length_load.mlir b/test/Conversion/XeTileToXeGPU/array_length_load.mlir index 749e58e46..7fd8f87ca 100644 --- a/test/Conversion/XeTileToXeGPU/array_length_load.mlir +++ b/test/Conversion/XeTileToXeGPU/array_length_load.mlir @@ -7,8 +7,8 @@ gpu.module @test_kernel { %a_loaded = xetile.load_tile %a_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16> // Do not let XeGPU do one load with multiple blocks (array_length > 1), where each block is finer than one GRF. 
- //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c16] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c16] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %b_tile = xetile.init_tile %b[%c0, %c0] : memref<1x32xf16> -> !xetile.tile<1x32xf16> %b_loaded = xetile.load_tile %b_tile : !xetile.tile<1x32xf16> -> vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/lit.local.cfg b/test/Conversion/XeTileToXeGPU/lit.local.cfg new file mode 100644 index 000000000..097b2470c --- /dev/null +++ b/test/Conversion/XeTileToXeGPU/lit.local.cfg @@ -0,0 +1,8 @@ + +# need slm support for XeTile lowering +excludes_slm_tests = [ + 'sg_mixed_scf.mlir', + 'sg_gemm_1k_1k_1k_f16_f32_slm.mlir', + ] + +config.excludes.update(excludes_slm_tests) diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir index 2db5c548d..fa61a6170 100644 --- a/test/Conversion/XeTileToXeGPU/reduction.mlir +++ b/test/Conversion/XeTileToXeGPU/reduction.mlir @@ -10,10 +10,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<16xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -118,9 +118,9 @@ module { %r = vector.multi_reduction , %e, %acc [1] : vector<16x32xf16> to vector<16xf16> //CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16> %c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16> - //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16> - //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16> gpu.return } @@ -132,8 +132,8 @@ module { %a_tile = xetile.init_tile %a[%c0, %c0] : memref<8x32xf32> -> !xetile.tile<8x32xf32> 
%b_tile = xetile.init_tile %b[%c0, %c0] : memref<8x1xf32> -> !xetile.tile<8x1xf32> - //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> - //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> %a_loaded = xetile.load_tile %a_tile: !xetile.tile<8x32xf32> -> vector<8x32xf32> //CHECK: %[[R1:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32> @@ -173,10 +173,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<32xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> //CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -231,9 +231,9 @@ module { %r = vector.multi_reduction , %e, %acc [0] : vector<16x32xf16> to vector<32xf16> //CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16> %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16> - //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16> - //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir index 15c9d742b..6df1b36bf 100644 --- 
a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir @@ -25,95 +25,95 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index //CHECK: %[[r7:.*]] = arith.addi %[[r1]], %[[c32]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index //CHECK: %[[r9:.*]] = arith.addi %[[r1]], %[[c48]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r16:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: 
%[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r21:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r26:.*]] = arith.addi %[[r0]], %[[c32]] : index - //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: 
%[[c40:.*]] = arith.constant 40 : index //CHECK: %[[r31:.*]] = arith.addi %[[r0]], %[[c40]] : index - //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r36:.*]] = arith.addi %[[r0]], %[[c48]] : index - //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c56:.*]] = arith.constant 56 : index //CHECK: %[[r41:.*]] = arith.addi %[[r0]], %[[c56]] : index - //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc 
%[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<64x64xf32> - //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<64x64xf32> -> vector<64x64xf32> - //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> 
!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> - //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> //CHECK: %[[r72:.*]]:16 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r62]], %[[arg5:.*]] = %[[r63]], %[[arg6:.*]] = %[[r64]], %[[arg7:.*]] = %[[r65]], %[[arg8:.*]] = %[[r66]], //CHECK-SAME: %[[arg9:.*]] = %[[r67]], %[[arg10:.*]] = %[[r68]], %[[arg11:.*]] = %[[r69]], %[[arg12:.*]] = %[[r54]], %[[arg13:.*]] = %[[r55]], //CHECK-SAME: %[[arg14:.*]] = %[[r56]], %[[arg15:.*]] = %[[r57]], %[[arg16:.*]] = %[[r58]], %[[arg17:.*]] = %[[r59]], %[[arg18:.*]] = %[[r60]], - //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, 
#xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c64 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) @@ -152,16 +152,16 @@ gpu.module @test_kernel { //CHECK: %[[r208:.*]] = vector.extract_strided_slice %[[arg19]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r106:.*]] = vector.extract %[[r105]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r107:.*]] = vector.extract %[[r105]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r109:.*]] = vector.extract %[[r108]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r110:.*]] = vector.extract %[[r108]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r112:.*]] = vector.extract %[[r111]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r113:.*]] = vector.extract %[[r111]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r115:.*]] = vector.extract %[[r114]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r116:.*]] = vector.extract %[[r114]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r117:.*]] = vector.extract_strided_slice %[[r106]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -198,16 +198,16 @@ gpu.module @test_kernel { //CHECK: %[[r148:.*]] = vector.extract_strided_slice %[[r116]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to 
vector<8x16xf16> %a_value = xetile.load_tile %a_tile : !xetile.tile<64x64xf16> -> vector<64x64xf16> - //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r150:.*]] = vector.extract %[[r149]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r151:.*]] = vector.extract %[[r149]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r153:.*]] = vector.extract %[[r152]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r154:.*]] = vector.extract %[[r152]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r156:.*]] = vector.extract %[[r155]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r157:.*]] = vector.extract %[[r155]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r159:.*]] = vector.extract %[[r158]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r160:.*]] = vector.extract %[[r158]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r161:.*]] = vector.extract_strided_slice %[[r150]] {offsets = [0, 0], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16> @@ -384,21 +384,21 @@ gpu.module @test_kernel { //CHECK: %[[r358:.*]] = vector.shuffle %[[r288]], %[[r304]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r359:.*]] = vector.shuffle %[[r320]], %[[r336]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r360:.*]] = vector.shuffle %[[r358]], %[[r359]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r362:.*]] = xegpu.update_nd_offset 
%[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c64] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> %b_next_tile = xetile.update_tile_offset %b_tile, [%c64, %c0] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> //CHECK: scf.yield %[[r361]], %[[r362]], %[[r363]], %[[r364]], %[[r365]], %[[r366]], %[[r367]], %[[r368]], %[[r339]], %[[r345]], %[[r351]], %[[r357]], %[[r342]], %[[r348]], %[[r354]], %[[r360]] - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, //CHECK-SAME: 
vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<64x64xf16>, !xetile.tile<64x64xf16>, vector<64x64xf32> @@ -435,38 +435,38 @@ gpu.module @test_kernel { //CHECK: %[[r102:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r103:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r104:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile: 
vector<64x64xf32>, !xetile.tile<64x64xf32> gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index a57b1300f..2e075d714 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -2,7 +2,7 @@ // RUN: --cse --convert-xetile-to-xegpu --cse %s -o -| FileCheck %s -#tile_attr = #xetile.tile_attr +#tile_attr = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,37 +26,37 @@ gpu.module @test_kernel { %0 = arith.muli %block_id_x, %c16 : index %1 = arith.muli %block_id_y, %c16 : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<128x128xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r5:.*]] = xegpu.load_nd %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<128x128xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r5:.*]] = xegpu.load_nd %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> !xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c1:.*]] = arith.constant 1 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c1]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r7]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r7]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c2:.*]] = arith.constant 2 : index //CHECK: %[[r9:.*]] = arith.addi %[[r0]], %[[c2]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r9]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r9]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c3:.*]] = arith.constant 3 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c3]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r11]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r11]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c4:.*]] = arith.constant 4 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c4]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r13]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r13]], %[[c0]]] : 
memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c5:.*]] = arith.constant 5 : index //CHECK: %[[r15:.*]] = arith.addi %[[r0]], %[[c5]] : index - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r15]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r15]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c6:.*]] = arith.constant 6 : index //CHECK: %[[r17:.*]] = arith.addi %[[r0]], %[[c6]] : index - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r17]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r17]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c7:.*]] = arith.constant 7 : index //CHECK: %[[r19:.*]] = arith.addi %[[r0]], %[[c7]] : index - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r19]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r19]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #tile_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #tile_attr> //CHECK: %[[r37:.*]]:10 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c128]] step %[[c16]] @@ -64,25 +64,25 @@ gpu.module @test_kernel { //CHECK-SAME: %[[arg7:.*]] = %[[r12]], %[[arg8:.*]] = %[[r14]], %[[arg9:.*]] = %[[r16]], //CHECK-SAME: %[[arg10:.*]] = %[[r18]], %[[arg11:.*]] = %[[r20]], %[[arg12:.*]] = %[[r21]], //CHECK-SAME: %[[arg28:.*]] = %[[r5]]) - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32> %6:3 = scf.for %arg3 = %c0 
to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<8x16xf16, #tile_attr>, !xetile.tile<16x16xf16, #tile_attr>, vector<8x16xf32>) { - //CHECK: %[[r38:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r39:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r40:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r41:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r42:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r43:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r44:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r45:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r38:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r39:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r40:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r41:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r42:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r43:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r44:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r45:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> //CHECK: %[[r46:.*]] = vector.shuffle %[[r38]], %[[r39]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> //CHECK: %[[r47:.*]] = vector.shuffle %[[r40]], %[[r41]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> //CHECK: %[[r48:.*]] = vector.shuffle %[[r42]], %[[r43]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> @@ -92,27 +92,27 @@ gpu.module @test_kernel { //CHECK: %[[r52:.*]] = vector.shuffle %[[r50]], %[[r51]] [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x16xf16>, vector<4x16xf16> %7 = xetile.load_tile %arg4 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf16, #tile_attr> -> vector<8x16xf16> - //CHECK: %[[r53:.*]] = xegpu.load_nd %[[arg12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> + //CHECK: %[[r53:.*]] = xegpu.load_nd %[[arg12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> %8 = xetile.load_tile %arg5 {padding = 0.000000e+00 : f32} : !xetile.tile<16x16xf16, #tile_attr> -> vector<16x16xf16> //CHECK: %[[r84:.*]] = xegpu.dpas %[[r52]], %[[r53]], %[[arg28]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> %9 = xetile.tile_mma %7, %8, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r85:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r86:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r87:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r88:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r89:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r90:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r91:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r92:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r85:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r86:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r87:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r88:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r89:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r90:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r91:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r92:.*]] = xegpu.update_nd_offset %[[arg11]], 
[%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c16] : !xetile.tile<8x16xf16, #tile_attr>, index, index -> !xetile.tile<8x16xf16, #tile_attr> - //CHECK: %[[r108:.*]] = xegpu.update_nd_offset %[[arg12]], [%[[c16]], %[[c0]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r108:.*]] = xegpu.update_nd_offset %[[arg12]], [%[[c16]], %[[c0]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c16, %c0] : !xetile.tile<16x16xf16, #tile_attr>, index, index -> !xetile.tile<16x16xf16, #tile_attr> scf.yield %10, %11, %9 : !xetile.tile<8x16xf16, #tile_attr>, !xetile.tile<16x16xf16, #tile_attr>, vector<8x16xf32> } - //CHECK: xegpu.store_nd %[[r37]]#9, %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r37]]#9, %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<8x16xf32>, !xetile.tile<8x16xf32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir index 2f477344e..f1cf087eb 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir @@ -24,44 +24,44 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: 
%[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xi32> -> !xetile.tile<32x32xi32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xi32> -> vector<32x32xi32> - //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> - //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> + //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = 
%b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32>) { - //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> + //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> //CHECK: %[[r40:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r41:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [8, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r42:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [16, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r43:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> - //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x32x16xi8> + //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x32x16xi8> //CHECK: %[[r45:.*]] = vector.extract %[[r44]][0] : vector<32x16xi8> from vector<2x32x16xi8> //CHECK: %[[r46:.*]] = vector.extract %[[r44]][1] : vector<32x16xi8> from vector<2x32x16xi8> %b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> @@ -69,14 +69,14 @@ gpu.module @test_kernel { //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x32xi8>, vector<32x16xi8>, vector<8x16xi32> -> vector<8x16xi32> %c_new_value = xetile.tile_mma %a_value, %b_value, %c_value : vector<32x32xi8>, vector<32x32xi8>, vector<32x32xi32> -> vector<32x32xi32> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c32] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> %b_next_tile = xetile.update_tile_offset %b_tile, [%c32, %c0] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32> } - //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile 
{innner_blocks = [8, 16]}: vector<32x32xi32>, !xetile.tile<32x32xi32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir index db8022dad..0e29a1fc5 100755 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir @@ -23,43 +23,43 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg2[%0, %1] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> //CHECK: %[[r24:.*]]:6 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r20]], %[[arg5:.*]] = %[[r21]], %[[arg6:.*]] = %[[r22]], %[[arg7:.*]] = %[[r23]], %[[arg8:.*]] = %[[r18]], %[[arg9:.*]] = %[[r19]]) - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, 
#xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c64 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32>) { //CHECK: %[[r65:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r66:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -70,10 +70,10 @@ gpu.module @test_kernel { //CHECK: %[[r71:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r72:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r34:.*]] = vector.extract %[[r33]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r35:.*]] = vector.extract %[[r33]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> - //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r37:.*]] = vector.extract %[[r36]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r38:.*]] = vector.extract %[[r36]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r39:.*]] = vector.extract_strided_slice %[[r34]] {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> @@ -94,8 +94,8 @@ gpu.module @test_kernel { //CHECK: %[[r54:.*]] = vector.extract_strided_slice %[[r38]] {offsets = [24, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> %7 = xetile.load_tile %arg4 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xtf32> -> vector<32x32xtf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> //CHECK: %[[r57:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r58:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r59:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> @@ -149,19 +149,19 @@ gpu.module @test_kernel { %9 = xetile.tile_mma %7, %8, %arg6 : vector<32x32xtf32>, vector<32x32xtf32>, vector<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c64] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> - //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c64, %c0] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> //CHECK: scf.yield %[[r111]], %[[r112]], %[[r113]], %[[r114]], %[[r107]], %[[r110]] - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> scf.yield %10, %11, %9 : !xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32> } @@ -173,14 +173,14 @@ gpu.module @test_kernel { //CHECK: %[[r30:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r31:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r32:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [24, 0], 
sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<32x32xf32>, !xetile.tile<32x32xf32> //CHECK: gpu.return gpu.return diff --git 
a/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir index 43deb87d2..0723d481e 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir @@ -17,18 +17,18 @@ gpu.module @test_kernel { %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xf32> -> vector<32x32xf32> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> -// CHECK: scf.for %{{.*}}= %{{.*}}to %{{.*}}step %{{.*}}iter_args(%{{.*}}= %{{.*}}, %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[T2]], %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}} = %{{.*}}) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: scf.for %{{.*}}= %{{.*}}to %{{.*}}step %{{.*}}iter_args(%{{.*}}= %{{.*}}, %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[T2]], %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}} = %{{.*}}) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xf16>, !xetile.tile<32x32xf16>, vector<32x32xf32>) { %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16> // Check if array_length is 1 for the load + transpose + MMA B case. 
//
- // xegpu.load_nd %[[ARG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
- // xegpu.load_nd %[[ARG6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
+ // xegpu.load_nd %[[ARG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
+ // xegpu.load_nd %[[ARG6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
%b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16>
%b_transpose = xetile.transpose %b_value, [1, 0] : vector<32x32xf16> -> vector<32x32xf16>
%c_new_value = xetile.tile_mma %a_value, %b_transpose, %c_value : vector<32x32xf16>, vector<32x32xf16>, vector<32x32xf32> -> vector<32x32xf32>
diff --git a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
index 2fb7cc259..194edb960 100644
--- a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
@@ -8,10 +8,10 @@ gpu.module @test_kernel {
//CHECK: %[[c64:.*]] = arith.constant 64 : index
%c64 = arith.constant 64 : index
//CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]]
- //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
//CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16>
gpu.return
}
diff --git a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
index 3a8ec5510..565804d46 100755
--- a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
@@ -56,7 +56,7 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp
%26 = arith.muli %arg3, %c1024 : index
%27 = arith.addi %26, %13 : index
%28 = arith.addi %27, %16 : index
- //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
+ //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
%29 = xetile.init_tile %arg1[%28, %15] : memref<2048x12288xbf16> -> !xetile.tile<32x32xbf16>
%30 = scf.for %arg4 = %c0 to %c2 step %c1 iter_args(%arg5 = %cst) -> (vector<1x4xf32>) {
@@ -65,23 +65,23 @@
%35 = arith.addi %34, %10 : index
%36 = arith.addi %35, %11 : index
- //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> ->
!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %37 = xetile.init_tile %arg0[%36, %9] : memref<16384x12288xbf16> -> !xetile.tile<32x32xbf16> %38:3 = scf.for %arg6 = %c0 to %c12288 step %c32 iter_args(%arg7 = %37, %arg8 = %29, %arg9 = %cst_0) -> (!xetile.tile<32x32xbf16>, !xetile.tile<32x32xbf16>, vector<32x32xf32>) { - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-8: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %48 = xetile.load_tile %arg7 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-4: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [16, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<16x16xbf16> %49 = xetile.load_tile %arg8 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %50 = xetile.update_tile_offset %arg7, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> %51 = xetile.update_tile_offset %arg8, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> @@ -101,24 +101,24 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %41 = vector.shape_cast %40 : vector<32xf32> to vector<1x32xf32> %alloc = memref.alloc() : memref<8x128xf32, 3> - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> - %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> - - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> - xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> - - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> 
!xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK-COUNT-8: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> -> vector<1x4xf32> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> + %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> + + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> + xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> + + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK-COUNT-8: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> -> vector<1x4xf32> //CHECK-COUNT-8: arith.addf %{{.*}}, %{{.*}} : vector<1x4xf32> - %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> - %44 = xetile.load_tile %43 {padding = 0.000000e+00 : f32} : !xetile.tile<8x4xf32, #xetile.tile_attr> -> vector<8x4xf32> + %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> + %44 = xetile.load_tile %43 {padding = 0.000000e+00 : f32} : !xetile.tile<8x4xf32, #xetile.tile_attr> -> vector<8x4xf32> %45 = vector.multi_reduction , %44, %cst_2 [0] : vector<8x4xf32> to vector<4xf32> %46 = vector.shape_cast %45 : vector<4xf32> to vector<1x4xf32> %47 = arith.addf %arg5, %46 : vector<1x4xf32> @@ -126,8 +126,8 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp } {lowerBoundMap = affine_map<() -> (0)>, 
operandSegmentSizes = array, step = 1 : index, syn.mm_dim = 0 : i64, syn.parall_level = 2 : i64, upperBoundMap = affine_map<() -> (2)>} //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> %31 = arith.addi %26, %16 : index %32 = xetile.init_tile %arg2[%25, %31] : memref<32x2048xf32> -> !xetile.tile<1x4xf32> xetile.store_tile %30, %32 : vector<1x4xf32>, !xetile.tile<1x4xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir index a452e0454..b500bf23d 100644 --- a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir @@ -12,19 +12,19 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %c1024 = arith.constant 1024 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) { + //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) { %nexta, %res = scf.for %k= %c0 to %c1024 step %c64 iter_args(%subA = %1, %subB = %cst) -> (!xetile.tile<32x32xf16>, vector<32x32xf16>) { - //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %3 = xetile.load_tile %subA : !xetile.tile<32x32xf16> -> vector<32x32xf16> - //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %5 = xetile.update_tile_offset %subA, [%c0, %c64]: !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16> - //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16> + //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16> scf.yield %5, %3: 
!xetile.tile<32x32xf16>, vector<32x32xf16> } //CHECK: %[[r2:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> @@ -33,19 +33,19 @@ gpu.module @test_kernel { //CHECK: %[[r5:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> 
xetile.store_tile %res, %5: vector<32x32xf16>, !xetile.tile<32x32xf16> //CHECK: gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir index e07c15bb5..f874de792 100644 --- a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir @@ -4,13 +4,13 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_0(%a: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> @@ -30,12 +30,12 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_1(%a: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> 
vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir index 07d7111f4..9b8007a62 100644 --- a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir @@ -12,28 +12,28 @@ gpu.module @test_kernel { %result = arith.constant dense<0.0>: vector<32x32xf32> //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index - //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = 
arith.constant 24 : index - //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %result, %1: vector<32x32xf32>, !xetile.tile<32x32xf32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir index 93524d094..bc1871af5 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir @@ -9,10 +9,10 @@ gpu.module @test_kernel { //CHECK: %[[c64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r2:.*]] = vector.extract %[[r1]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r3:.*]] = vector.extract %[[r1]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r4:.*]] = vector.extract_strided_slice %[[r2]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -24,15 +24,15 @@ gpu.module @test_kernel { //CHECK: %[[r10:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> //CHECK: %[[r11:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %3 = xetile.init_tile %b[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + 
//CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r15:.*]] = vector.extract %[[r14]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r16:.*]] = vector.extract %[[r14]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r18:.*]] = vector.extract %[[r17]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r19:.*]] = vector.extract %[[r17]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r20:.*]] = vector.extract_strided_slice %[[r15]] {offsets = [0, 0], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir index d24ae787b..f503b4dea 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir @@ -10,7 +10,7 @@ gpu.module @test_kernel { %3 = xetile.tile_unpack %2 {inner_blocks = array}: vector<32x4x1x16xf16> -> vector<32x64xf16> %4 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> %5 = xetile.tile_pack %3 {inner_blocks = array}: vector<32x64xf16> -> vector<32x4x1x16xf16> - // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> xetile.store_tile %5, %4 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr> gpu.return } @@ -87,7 +87,7 @@ gpu.module @test_kernel { %2 = xetile.tile_unpack %1 {inner_blocks = array}: vector<32x4x1x16xf16> -> vector<32x64xf16> %3 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> %4 = xetile.tile_pack %2 {inner_blocks = array}: vector<32x64xf16> -> vector<32x4x1x16xf16> - // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> xetile.store_tile %4, %3 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir index cb2f84467..92d59f0ee 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir @@ -9,11 +9,11 @@ gpu.module @test_kernel { %c64 = 
arith.constant 64 : index
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[%[[C0]], %[[C64]]] : memref<1024x1024xf16>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
// CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
gpu.return
}
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
index a12536ed1..62d7e313d 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
@@ -14,30 +14,30 @@
// CHECK: %[[c1024:.*]] = arith.constant 1024 : index
%c1024 = arith.constant 1024 : index
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
// CHECK: %[[R1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]]
- // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
+ // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
%1:2 = scf.for %arg2 = %c0 to %c1024 step %c64 iter_args(%arg3 = %0, %arg4 = %cst) -> (!xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>) {
- // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%5 = xetile.load_tile %arg3 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
- // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%6 = xetile.update_tile_offset %arg3, [%c0, %c64] : !xetile.tile<32x32xf16, #xetile.tile_attr>, index, index -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
+ // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
scf.yield %6, %5 : !xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>
}
- // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc
%[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c8:.*]] = arith.constant 8 : index - // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c16:.*]] = arith.constant 16 : index - // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c24:.*]] = arith.constant 24 : index - // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg1[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr> // CHECK: %[[R6:.*]] = vector.extract_strided_slice %[[R1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> @@ -47,10 +47,10 @@ %3 = xetile.tile_unpack %1#1 {inner_blocks = array} : vector<1x1x32x32xf16> -> vector<32x32xf16> %4 = xetile.tile_pack %3 {inner_blocks = array}: vector<32x32xf16> -> vector<4x1x8x32xf16> - // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> xetile.store_tile %4, %2 : vector<4x1x8x32xf16>, 
!xetile.tile<32x32xf16, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir index a520e588b..9929788d0 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir @@ -4,13 +4,13 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_0(%arg0: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> @@ -35,12 +35,12 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_1(%arg0: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint 
= #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir index 4419001ca..52ba152c0 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir @@ -8,25 +8,25 @@ // CHECK: %[[c0:.*]] = arith.constant 0 : index // CHECK: %[[c32:.*]] = arith.constant 32 : index // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c48:.*]] = arith.constant 48 : index // CHECK: %[[R1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c16:.*]] = arith.constant 16 : index // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c24:.*]] = arith.constant 24 : index // CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : 
memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32, #xetile.tile_attr> // CHECK: %[[R8:.*]] = vector.extract_strided_slice %[[cst]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -37,21 +37,21 @@ %2 = xetile.tile_pack %1 {inner_blocks = array} : vector<32x32xf32> -> vector<4x2x8x16xf32> // CHECK: xegpu.store_nd %[[R8]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R8]], %[[R1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R9]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R9]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R10]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R10]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R11]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R11]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %2, %0 : vector<4x2x8x16xf32>, !xetile.tile<32x32xf32, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir index 31cac7e21..6079e9f29 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir @@ -8,24 +8,24 @@ gpu.module @test_kernel { // CHECK: %[[C64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index - // CHECK: %[[REG0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr> - // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG2:.*]] = vector.extract %[[REG1]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG3:.*]] = vector.extract %[[REG1]][1] : vector<32x16xf16> from vector<2x32x16xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x2x32x16xf16> - // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[C32:.*]] = arith.constant 32 : index - // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg1[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG7:.*]] = vector.extract %[[REG6]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG8:.*]] = vector.extract %[[REG6]][1] : vector<32x16xf16> from vector<2x32x16xf16> - // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG10:.*]] = vector.extract %[[REG9]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG11:.*]] = vector.extract %[[REG9]][1] : vector<32x16xf16> from vector<2x32x16xf16> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x4x32x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/test_order.mlir b/test/Conversion/XeTileToXeGPU/test_order.mlir index 60b8b029f..b0eedce8e 100644 --- a/test/Conversion/XeTileToXeGPU/test_order.mlir +++ 
b/test/Conversion/XeTileToXeGPU/test_order.mlir @@ -5,10 +5,10 @@ // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[R_CAST:.*]] = memref.reinterpret_cast %[[ARG1]] to offset: [0], sizes: [128, 64], strides: [64, 1] : memref<64x128xf16, strided<[1, 64]>> to memref<128x64xf16, strided<[64, 1]>> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16, strided<[64, 1]>> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> -// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16, strided<[64, 1]>> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> +// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> gpu.module @test_kernel { func.func @test_func(%A : memref<128x64xf16>, %B : memref<64x128xf16, strided<[1, 64], offset: 0>>) { %c0 = arith.constant 0 : index diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir index 6f729cc9f..7d88ec0a7 100644 --- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir +++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir @@ -24,10 +24,10 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 - // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 - -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: xegpu.create_tdesc %{{.*}} : ui64, vector<16xindex> + // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir index 7551f7308..159c338a0 100644 --- a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir +++ b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir @@ -5,8 +5,8 @@ // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_atomic_rmw({{.*}}) { -func.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> +func.func @test_atomic_rmw(%src: 
ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> @@ -16,8 +16,8 @@ func.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<1 } // CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) { -func.func @test_atomic_rmw_0(%src: ui64, %value : vector<16x2xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> +func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x2xf32> @@ -27,8 +27,8 @@ func.func @test_atomic_rmw_0(%src: ui64, %value : vector<16x2xf32>, %mask : vect } // CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) { -func.func @test_atomic_rmw_1(%src: ui64, %value : vector<16x2xi32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> +func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x2xi32> diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir index e437622a6..d351fb826 100644 --- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir +++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir @@ -65,9 +65,9 @@ func.func @test_create_nd_tdesc_4(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -75,9 +75,9 @@ func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_6(%src: memref, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -94,9 +94,9 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) { func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : 
index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -104,8 +104,8 @@ func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref - -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr> + -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir index e24c15574..6f652a21c 100644 --- a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir +++ b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir @@ -69,9 +69,8 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } @@ -80,9 +79,8 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } @@ -100,16 +98,15 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) { func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } // CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) { func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) { // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %1 = 
xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
return
}
diff --git a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
index fd2da2354..137f77816 100644
--- a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -1,49 +1,51 @@
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s
+// RUN: imex-opt %s | FileCheck %s
// Verify the printed output can be parsed.
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
+// RUN: imex-opt %s | imex-opt | FileCheck %s
// Verify the generic form can be parsed.
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
+// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s
// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
-func.func @test_create_tdesc_vc(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) {
-func.func @test_create_tdesc_vc_2(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
- -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets :
+ ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) {
-func.func @test_create_tdesc_vc_3(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
+func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
-> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) {
-func.func @test_create_tdesc_vc_4(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64
- // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
- -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1 : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 =
xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) { -func.func @test_create_tdesc_vc_5(%src: memref) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: memref - // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref - -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> +func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) { + // CHECK: xegpu.create_tdesc {{.*}} : memref, vector<16xindex> + // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src, %offsets : memref, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/invalid_vc.mlir b/test/Dialect/XeGPU/IR/invalid_vc.mlir index 3cfb4ad9b..aef5e77a5 100644 --- a/test/Dialect/XeGPU/IR/invalid_vc.mlir +++ b/test/Dialect/XeGPU/IR/invalid_vc.mlir @@ -47,19 +47,19 @@ func.func @test_create_nd_tdesc_vc_5(%input: memref<24x32x64xf32>) { } // ----- -func.func @test_create_tdesc(%src: ui64) { - // expected-error@+1 {{Incorrect TensorDesc shape}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] - : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>> +func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) { + // expected-error@+1 {{operand #1 must be vector of index values of ranks 1}} + %1 = xegpu.create_tdesc %src, %offsets + : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>> return } // ----- -func.func @test_load_gather(%src: ui64) { +func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc {{.*}} : ui64 + // CHECK: xegpu.create_tdesc {{.*}} : ui64, vector<16xindex> // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{failed to verify that all of {value, TensorDesc} have same rank}} @@ -69,25 +69,25 @@ func.func @test_load_gather(%src: ui64) { } // ----- -func.func @test_create_tdesc_oversized(%src: ui64) { +func.func @test_create_tdesc_oversized(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{total access size (simd_lanes * chunk_size * sizeof(elemTy)) is upto 512 bytes}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x16xf32, #xegpu.scatter_tdesc_attr> return } // ----- -func.func @test_create_tdesc_invalid_chunk_size(%src: ui64) { +func.func @test_create_tdesc_invalid_chunk_size(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{Invalid chunk_size. 
Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256.}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x7xf32, #xegpu.scatter_tdesc_attr> return } // ----- -func.func @test_create_tdesc_unaligned(%src: ui64) { +func.func @test_create_tdesc_unaligned(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x3xf16, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/test/Dialect/XeGPU/IR/load_gather_vc.mlir index 3209205ac..e65275267 100644 --- a/test/Dialect/XeGPU/IR/load_gather_vc.mlir +++ b/test/Dialect/XeGPU/IR/load_gather_vc.mlir @@ -1,16 +1,16 @@ -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s +// RUN: imex-opt %s | FileCheck %s // Verify the printed output can be parsed. -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s +// RUN: imex-opt %s | imex-opt | FileCheck %s // Verify the generic form can be parsed. -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s +// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_load_gather_vc({{.*}}) { -func.func @test_load_gather_vc(%src: ui64) { +func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> @@ -20,12 +20,12 @@ func.func @test_load_gather_vc(%src: ui64) { } // CHECK-LABEL: func @test_load_gather_vc_2({{.*}}) { -func.func @test_load_gather_vc_2(%src: ui64) { +func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64, vector<16xindex> //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> @@ -36,11 +36,11 @@ func.func @test_load_gather_vc_2(%src: ui64) { } // CHECK-LABEL: func @test_load_gather_vc_4({{.*}}) { -func.func @test_load_gather_vc_4(%src: ui64) { +func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 
-> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> diff --git a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir index ad8a5b9f4..df304e739 100644 --- a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir +++ b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir @@ -5,13 +5,16 @@ // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_store_scatter_vc({{.*}}) { -func.func @test_store_scatter_vc(%src: ui64, %dst: ui64) { +func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %2 = xegpu.create_tdesc %dst[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %2 = xegpu.create_tdesc %dst, %offsets + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} diff --git a/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 15f03b34e..2a90d4c07 100644 --- a/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -7,8 +7,10 @@ // CHECK-LABEL: func @test_update_offset_VC({{.*}}) { func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} @@ -16,8 +18,12 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.update_offset %{{.*}} : !xegpu.tensor_desc<16xf32, 
#xegpu.scatter_tdesc_attr<>> - %5 = xegpu.update_offset %1, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %3 = arith.constant dense<16>: vector<16 x index> + + // CHECK: xegpu.update_offset + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> + %5 = xegpu.update_offset %1, %3 + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16 x index> return } diff --git a/test/Dialect/XeTile/IR/ops.mlir b/test/Dialect/XeTile/IR/ops.mlir index f0ba5981d..1a952e3f0 100644 --- a/test/Dialect/XeTile/IR/ops.mlir +++ b/test/Dialect/XeTile/IR/ops.mlir @@ -22,8 +22,8 @@ #wg_map_b2 = #xetile.wg_map func.func @test_init_tile_for_slm(%a: memref<1024x1024xf16, 3>) { - //CHECK: xetile.init_tile {{.*}}[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - %1 = xetile.init_tile %a[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> + //CHECK: xetile.init_tile {{.*}}[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> + %1 = xetile.init_tile %a[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> return } diff --git a/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index 2bd13c234..0843b7485 100644 --- a/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt --xetile-init-duplicate --new-xetile-blocking --canonicalize --cse %s | FileCheck %s -#slm = #xetile.tile_attr +#slm = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,8 +26,8 @@ gpu.module @test_kernel { %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> !xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> - //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> + //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> + //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #slm> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #slm> %6:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) diff --git a/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index b2270afb1..91ae5b63c 100644 --- a/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt --xetile-init-duplicate --xetile-blocking --canonicalize --cse %s | FileCheck %s -#slm = #xetile.tile_attr +#slm = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,8 +26,8 @@ gpu.module @test_kernel { %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> 
!xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> - //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> + //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> + //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #slm> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #slm> %6:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) diff --git a/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir b/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir index b05102cb1..312a07dc0 100644 --- a/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir +++ b/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir @@ -51,11 +51,11 @@ gpu.module @test_gemm_btranspose{ %4 = arith.muli %block_id_x, %c2048 : index %5 = arith.muli %0, %c256 : index %6 = arith.addi %4, %5 : index - %7 = xetile.init_tile %arg2[%6, %3] : memref<16384x1536xf32> -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + %7 = xetile.init_tile %arg2[%6, %3] : memref<16384x1536xf32> -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> %8 = arith.muli %block_id_x, %c2048 : index %9 = arith.muli %0, %c256 : index %10 = arith.addi %8, %9 : index - %11 = xetile.init_tile %arg0[%10, %c0] : memref<16384x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + %11 = xetile.init_tile %arg0[%10, %c0] : memref<16384x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> //CHECK: %[[R7:.*]] = index.floordivs %[[R6]], %[[c8]] //CHECK: %[[R8:.*]] = index.remu %[[R6]], %[[c8]] @@ -69,16 +69,16 @@ gpu.module @test_gemm_btranspose{ //CHECK: %[[R16:.*]] = index.add %[[R15]], %[[c0]] //CHECK: %[[INITTILE:.*]] = xetile.init_tile %[[arg1]][%[[R12]], %[[R16]]] : memref<1536x12288xf16> -> !xetile.tile<64x32xf16> - %12 = xetile.init_tile %arg1[%2, %c0] : memref<1536x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %13:2 = scf.for %arg15 = %c0 to %c2 step %c1_1 iter_args(%arg16 = %7, %arg17 = %11) -> (!xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>) { - %14 = xetile.update_tile_offset %arg17, [%c1024, %c0] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %15 = xetile.update_tile_offset %arg16, [%c1024, %c0] : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %16:3 = scf.for %arg18 = %c0 to %c12288 step %c32_2 iter_args(%arg19 = %cst, %arg20 = %arg17, %arg21 = %12) -> (vector<256x256xf32>, 
!xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>) { - %18 = xetile.update_tile_offset %arg21, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %19 = xetile.update_tile_offset %arg20, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %20 = xetile.load_tile %arg20 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> -> vector<256x32xf16> + %12 = xetile.init_tile %arg1[%2, %c0] : memref<1536x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %13:2 = scf.for %arg15 = %c0 to %c2 step %c1_1 iter_args(%arg16 = %7, %arg17 = %11) -> (!xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>) { + %14 = xetile.update_tile_offset %arg17, [%c1024, %c0] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %15 = xetile.update_tile_offset %arg16, [%c1024, %c0] : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %16:3 = scf.for %arg18 = %c0 to %c12288 step %c32_2 iter_args(%arg19 = %cst, %arg20 = %arg17, %arg21 = %12) -> (vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>) { + %18 = xetile.update_tile_offset %arg21, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %19 = xetile.update_tile_offset %arg20, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %20 = xetile.load_tile %arg20 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> -> vector<256x32xf16> %21 = math.exp %20 {map = #xetile.wg_map} : vector<256x32xf16> - %22 = xetile.load_tile %arg21 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> -> vector<256x32xf16> + %22 = xetile.load_tile %arg21 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> -> vector<256x32xf16> //CHECK: %[[TRANSPOSE:.*]] vector.transpose {{%.*}}, [1, 0] : vector<64x32xf16> to vector<32x64xf16> %23 = vector.transpose %22, [1, 0] {map = #xetile.wg_map} : vector<256x32xf16> to vector<32x256xf16> %24 = math.exp %23 {map = #xetile.wg_map} : vector<32x256xf16> @@ -86,11 +86,11 @@ gpu.module @test_gemm_btranspose{ %25 = xetile.tile_mma %21, %24, %cst 
{wg_map_a =#xetile.wg_map, wg_map_b =#xetile.wg_map, wg_map_c =#xetile.wg_map} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> xegpu.compile_hint %26 = arith.addf %arg19, %25 {map = #xetile.wg_map} : vector<256x256xf32> - scf.yield %26, %19, %18 : vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + scf.yield %26, %19, %18 : vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> } %17 = math.exp %16#0 {map = #xetile.wg_map} : vector<256x256xf32> - xetile.store_tile %17, %arg16 : vector<256x256xf32>, !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - scf.yield %15, %14 : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + xetile.store_tile %17, %arg16 : vector<256x256xf32>, !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + scf.yield %15, %14 : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> } gpu.terminator } diff --git a/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir b/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir index 888bb0bad..ac2f1261a 100644 --- a/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir +++ b/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir @@ -1,11 +1,11 @@ // RUN: imex-opt --split-input-file --xetile-wg-to-sg --cse %s -verify-diagnostics | FileCheck %s #wg_map_a = #xetile.wg_map -#tile_attr_a = #xetile.tile_attr +#tile_attr_a = #xetile.tile_attr #wg_map_b = #xetile.wg_map -#tile_attr_b = #xetile.tile_attr +#tile_attr_b = #xetile.tile_attr #wg_map_c = #xetile.wg_map -#tile_attr_c = #xetile.tile_attr +#tile_attr_c = #xetile.tile_attr #map = affine_map<() -> (0)> #map1 = affine_map<() -> (12288)> diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir index 2309e13f5..098eb8928 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16x4xf32>) -> memref<16x4xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -23,11 +23,13 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_copy(%a: memref<16x4xf32>, %b: memref<16x4xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 
44, 48, 52, 56, 60]> : vector<16xindex> // load from a using load_gather %a_cast = memref.reinterpret_cast %a to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %a_tdesc = xegpu.create_tdesc %a_cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a_cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16x4xf32, #scatter> %data = xegpu.load %a_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> -> vector<4x16xf32> @@ -38,7 +40,7 @@ module @gemm attributes {gpu.container_module} { // store to b using store_scatter %b_cast = memref.reinterpret_cast %b to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %b_tdesc = xegpu.create_tdesc %b_cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b_cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.store %data, %b_tdesc, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir index 748b24ab4..3fbd1b227 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16x8xf32>) -> memref<16x8xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -23,11 +23,13 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_copy(%a: memref<16x8xf32>, %b: memref<16x8xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // load from a using load_gather %a_cast = memref.reinterpret_cast %a to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %a_tdesc = xegpu.create_tdesc %a_cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a_cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16x8xf32, #scatter> %data = xegpu.load %a_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> -> vector<8x16xf32> @@ -38,7 +40,7 @@ module @gemm attributes {gpu.container_module} { // store to b using store_scatter %b_cast = memref.reinterpret_cast %b to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %b_tdesc 
= xegpu.create_tdesc %b_cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b_cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.store %data, %b_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir index f4edaad18..bd699c38d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16xf16>) -> memref<16xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -25,16 +25,17 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_copy(%a: memref<16xf16>, %b: memref<16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // load from a using load_gather - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> // %v1 = vector.extract %data[4]: f16 from vector<16xf16> // gpu.printf "\ndata[4] : %f.\n" %v1: f16 // store to b using store_scatter - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir index 7cffad49b..0516e79a6 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16xf32>) -> memref<16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -28,12 +28,12 @@ module @gemm attributes {gpu.container_module} { %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : 
vector<16xindex> // load from a using load_gather - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf32, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> // store to b using store_scatter - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir index 282c41306..012be6c1d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x4xf32> attributes {llvm.emit_c_interface} { @@ -23,8 +23,9 @@ module @gemm attributes {gpu.container_module} { [48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63.]]> : vector<4x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %5 = xegpu.create_tdesc %cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %5 = xegpu.create_tdesc %cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.store %cst, %5, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir index b614e21e5..f0241bfb0 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir @@ -6,7 +6,7 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -28,9 +28,10 @@ module @gemm attributes {gpu.container_module} { [112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126., 127.]]> : vector<8x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 
88, 96, 104, 112, 120]> : vector<16xindex> %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.store %cst, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir index 000547969..932f113e1 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf16> attributes {llvm.emit_c_interface} { @@ -20,7 +20,8 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf16> %mask = arith.constant dense<1> : vector<16xi1> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %cst, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir index e83027672..f1508f56c 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf32> attributes {llvm.emit_c_interface} { @@ -20,7 +20,8 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf32> %mask = arith.constant dense<1> : vector<16xi1> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %cst, %tdesc, %mask : vector<16xf32>, 
!xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir index 11f8ff90f..2a354496e 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x4xf32> attributes {llvm.emit_c_interface} { @@ -24,17 +24,18 @@ module @gemm attributes {gpu.container_module} { [48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63.]]> : vector<4x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<64xf32, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32, 3> -> !xegpu.tensor_desc<16x4xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<64xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #slm>, vector<16xi1> // load from slm %data = xegpu.load %slm_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x4xf32, #slm>, vector<16xi1> -> vector<4x16xf32> // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %5 = xegpu.create_tdesc %cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir index 56c7aa516..bc5bf1708 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir @@ -6,8 +6,8 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -30,9 +30,10 @@ module @gemm attributes {gpu.container_module} { [112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126., 127.]]> : vector<8x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // store the cst 
into slm - %slm_tdesc = xegpu.create_tdesc %slm[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32, 3> -> !xegpu.tensor_desc<16x8xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<128xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #slm>, vector<16xi1> // load from slm @@ -40,7 +41,7 @@ module @gemm attributes {gpu.container_module} { // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir index 1e8edb13a..ce5e0f521 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir @@ -6,8 +6,8 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -33,7 +33,7 @@ module @gemm attributes {gpu.container_module} { %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // store the cst into slm - %slm_tdesc = xegpu.create_tdesc %slm[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32, 3> -> !xegpu.tensor_desc<16x8xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<128xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #slm>, vector<16xi1> // load from slm @@ -41,7 +41,7 @@ module @gemm attributes {gpu.container_module} { // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir index ea000b14b..695157f95 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main 
--entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf16> attributes {llvm.emit_c_interface} { @@ -20,15 +20,16 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_store_scatter(%mem: memref<16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf16> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<16xf16, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16, 3> -> !xegpu.tensor_desc<16xf16, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf16, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #slm> xegpu.store %cst, %slm_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> -> vector<16xf16> // store data to global memory - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #global> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #global> xegpu.store %data, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #global>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir index 2a10527fb..e75d5ee9d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf32> attributes {llvm.emit_c_interface} { @@ -21,14 +21,15 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<16xf32, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32, 3> -> !xegpu.tensor_desc<16xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> %data = 
xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> -> vector<16xf32> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #global> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #global> xegpu.store %cst, %tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #global>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir b/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir index 7ac7761e5..353a233ed 100644 --- a/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir +++ b/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir @@ -7,7 +7,7 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#slm = #xegpu.block_tdesc_attr +#slm = #xegpu.block_tdesc_attr module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf32 : memref<32xf32> = dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]> diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir index 34d8183fd..c082c7c3c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir @@ -43,20 +43,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded - %row_1_in_td = xegpu.create_tdesc %arg0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %row_1_out_td = xegpu.create_tdesc %arg1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %row_2_out_td = xegpu.update_offset %row_1_out_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> 
%row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir index 46abbae1f..ca2e4bba0 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir @@ -32,9 +32,10 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_scattered(%in: memref, %out: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { // We have 16 work items, each accesses 2 elements: {chunk_size = 2}, hence 16x2 tensor. // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. + %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> %loaded = xegpu.load %tdesc_in, %mask {transpose} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2x16xf32> xegpu.store %loaded, %tdesc_out, %mask {transpose} : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir index bd8ca7111..638db7833 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir @@ -32,9 +32,10 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_scattered(%in: memref, %out: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { // We have 16 work items, each accesses 2 elements: {chunk_size = 2}, hence 16x2 tensor. // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. 
+ %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> %loaded = xegpu.load %tdesc_in, %mask {transpose} : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2x16xi32> xegpu.store %loaded, %tdesc_out, %mask {transpose} : vector<2x16xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir index 0eb253e82..88e2cbf8c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir @@ -24,11 +24,12 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_scattered(%arg0: memref<1x16xf32>, %arg1: memref<1x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %tdesc2 = xegpu.create_tdesc %2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir index daa6e9d12..349576169 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir @@ -24,11 +24,12 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_scattered(%arg0: memref<1x16xf32>, %arg1: memref<1x16xf32>) kernel attributes 
{VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %tdesc2 = xegpu.create_tdesc %2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/optimize_transpose.mlir b/test/Integration/Dialect/XeGPU/optimize_transpose.mlir index 93327f688..8f8d61a67 100644 --- a/test/Integration/Dialect/XeGPU/optimize_transpose.mlir +++ b/test/Integration/Dialect/XeGPU/optimize_transpose.mlir @@ -34,34 +34,34 @@ module @gemm attributes {gpu.container_module} { %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %10 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> - %11 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.load_nd %10 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> - %13 = xegpu.load_nd %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> - %14 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg1[%3, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc 
%arg1[%3, %c16] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %17:5 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %14, %arg5 = %15, %arg6 = %16, %arg7 = %12, %arg8 = %13) -> (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32>) { + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %10 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.load_nd %10 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> + %13 = xegpu.load_nd %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> + %14 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg1[%3, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg1[%3, %c16] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %17:5 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %14, %arg5 = %15, %arg6 = %16, %arg7 = %12, %arg8 = %13) -> (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32>) { %22 = vector.extract_strided_slice %arg7 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %23 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %arg8 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %25 = vector.extract_strided_slice %arg8 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> - %26 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> + %26 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> %27 = vector.extract %26[0] : vector<16x16xf16> from vector<2x16x16xf16> %28 = vector.extract %26[1] : vector<16x16xf16> from vector<2x16x16xf16> %29 = vector.extract_strided_slice %27 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %30 = vector.extract_strided_slice %27 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %31 = vector.extract_strided_slice %28 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %32 = vector.extract_strided_slice %28 {offsets = 
[8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> - %33 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> - %34 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> + %33 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> + %34 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> %35 = vector.transpose %33, [1, 0] : vector<32x16xf16> to vector<16x32xf16> %36 = vector.shape_cast %35 {packed} : vector<16x32xf16> to vector<512xf16> %37 = vector.shuffle %36, %36 [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 64, 96, 65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102, 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110, 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118, 87, 119, 88, 120, 89, 121, 90, 122, 91, 123, 92, 124, 93, 125, 94, 126, 95, 127, 128, 160, 129, 161, 130, 162, 131, 163, 132, 164, 133, 165, 134, 166, 135, 167, 136, 168, 137, 169, 138, 170, 139, 171, 140, 172, 141, 173, 142, 174, 143, 175, 144, 176, 145, 177, 146, 178, 147, 179, 148, 180, 149, 181, 150, 182, 151, 183, 152, 184, 153, 185, 154, 186, 155, 187, 156, 188, 157, 189, 158, 190, 159, 191, 192, 224, 193, 225, 194, 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201, 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208, 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 215, 247, 216, 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223, 255, 256, 288, 257, 289, 258, 290, 259, 291, 260, 292, 261, 293, 262, 294, 263, 295, 264, 296, 265, 297, 266, 298, 267, 299, 268, 300, 269, 301, 270, 302, 271, 303, 272, 304, 273, 305, 274, 306, 275, 307, 276, 308, 277, 309, 278, 310, 279, 311, 280, 312, 281, 313, 282, 314, 283, 315, 284, 316, 285, 317, 286, 318, 287, 319, 320, 352, 321, 353, 322, 354, 323, 355, 324, 356, 325, 357, 326, 358, 327, 359, 328, 360, 329, 361, 330, 362, 331, 363, 332, 364, 333, 365, 334, 366, 335, 367, 336, 368, 337, 369, 338, 370, 339, 371, 340, 372, 341, 373, 342, 374, 343, 375, 344, 376, 345, 377, 346, 378, 347, 379, 348, 380, 349, 381, 350, 382, 351, 383, 384, 416, 385, 417, 386, 418, 387, 419, 388, 420, 389, 421, 390, 422, 391, 423, 392, 424, 393, 425, 394, 426, 395, 427, 396, 428, 397, 429, 398, 430, 399, 431, 400, 432, 401, 433, 402, 434, 403, 435, 404, 436, 405, 437, 406, 438, 407, 439, 408, 440, 409, 441, 410, 442, 411, 443, 412, 444, 413, 445, 414, 446, 415, 447, 448, 480, 449, 481, 450, 482, 451, 483, 452, 484, 453, 485, 454, 486, 455, 487, 456, 488, 457, 489, 458, 490, 459, 491, 460, 492, 461, 493, 462, 494, 463, 495, 464, 496, 465, 497, 466, 498, 467, 499, 468, 500, 469, 501, 470, 502, 471, 503, 472, 504, 473, 505, 474, 506, 475, 507, 476, 508, 477, 509, 478, 510, 479, 511] {packed} : vector<512xf16>, vector<512xf16> @@ -84,19 
+84,19 @@ module @gemm attributes {gpu.container_module} { %54 = xegpu.dpas %32, %46, %53 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> %55 = vector.shuffle %48, %52 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %56 = vector.shuffle %50, %54 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> - %57 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %58 = xegpu.update_nd_offset %arg5, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %59 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield %57, %58, %59, %55, %56 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32> + %57 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %58 = xegpu.update_nd_offset %arg5, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %57, %58, %59, %55, %56 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32> } %18 = vector.extract_strided_slice %17#3 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %19 = vector.extract_strided_slice %17#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %20 = vector.extract_strided_slice %17#4 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %21 = vector.extract_strided_slice %17#4 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> - xegpu.store_nd %18, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %20, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %19, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %21, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %18, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %20, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %19, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %21, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, 
!xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/SPIRV/OpTest.spirv.CL.printf.mlir b/test/SPIRV/OpTest.spirv.CL.printf.mlir index a7be187f6..b771a2726 100644 --- a/test/SPIRV/OpTest.spirv.CL.printf.mlir +++ b/test/SPIRV/OpTest.spirv.CL.printf.mlir @@ -45,7 +45,7 @@ module @print_simple attributes {gpu.container_module} { %printfMsg1_addr = spirv.mlir.addressof @printfMsg1 : !spirv.ptr, UniformConstant> %0 = spirv.Bitcast %printfMsg1_addr : !spirv.ptr, UniformConstant> to !spirv.ptr - %1 = spirv.CL.printf %0 : !spirv.ptr (%arg0, %arg1 : i32, f32) -> i32 + %1 = spirv.CL.printf %0 %arg0, %arg1 : !spirv.ptr, i32, f32 -> i32 spirv.Return } diff --git a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir index 864aad766..464eb9507 100644 --- a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir +++ b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir @@ -53,11 +53,11 @@ module { %26 = arith.muli %25, %c256 : index %27 = arith.divsi %15, %c32 : index %28 = arith.muli %27, %c32 : index - %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %30 = arith.divsi %23, %c32 : index %31 = arith.muli %30, %c32 : index %32 = arith.addi %26, %2 : index - %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %34 = arith.remsi %11, %c4 : index %35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { %39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32> @@ -75,29 +75,29 @@ module { %50 = arith.addi %49, %24 : index %51 = arith.divsi %50, %c128 : index %52 = arith.muli %51, %c128 : index - %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %57 = arith.addi %52, %3 : index - %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { + %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %391 = arith.cmpi eq, %arg18, %c21 : index %392 = arith.select %391, %c0, %arg18 : index scf.if %391 { gpu.barrier } %393 = arith.addi %392, %c1 : index - %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> %395 = vector.shape_cast %394 : vector<2x32x16xbf16> to vector<1024xbf16> %396 = vector.shuffle %395, %395 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %397 = vector.shuffle %395, %395 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 
558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -117,7 +117,7 @@ module { %411 = vector.shape_cast %410 : vector<128xbf16> to vector<8x16xbf16> %412 = vector.shuffle %397, %397 [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %413 = vector.shape_cast %412 : vector<128xbf16> to vector<8x16xbf16> - %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> + %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> %415 = vector.shape_cast %414 : vector<2x16x16x2xbf16> to vector<1024xbf16> %416 = vector.shuffle %415, %415 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %417 = vector.shuffle %415, %415 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 
668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -130,15 +130,15 @@ module { %424 = vector.shuffle %417, %417 [256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %425 = vector.shape_cast %424 : vector<256xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %430 = xegpu.dpas %399, %419, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %431 = xegpu.dpas %407, %421, %430 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -157,7 +157,7 @@ module { %444 = xegpu.dpas %405, %423, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %445 = xegpu.dpas %413, %425, %444 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index + scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index } %63 = vector.shape_cast %62#2 : vector<8x16xf32> to vector<128xf32> %64 = vector.shape_cast %62#3 : vector<8x16xf32> to vector<128xf32> @@ -419,13 +419,13 @@ module { %320 = arith.addf %318, %319 : vector<16xf32> %321 = vector.shuffle %317, %320 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32> %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class> - %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> 
-> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %323 = arith.addi %13, %c8 : index - %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %325 = arith.addi %13, %c16 : index - %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %327 = arith.addi %13, %c24 : index - %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %329 = vector.shuffle %321, %321 [0, 1, 2, 3, 4, 5, 6, 7] : vector<32xf32>, vector<32xf32> %330 = vector.shape_cast %329 : vector<8xf32> to vector<8x1xf32> %331 = vector.shuffle %321, %321 [8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf32>, vector<32xf32> @@ -434,13 +434,13 @@ module { %334 = vector.shape_cast %333 : vector<8xf32> to vector<8x1xf32> %335 = vector.shuffle %321, %321 [24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf32>, vector<32xf32> %336 = vector.shape_cast %335 : vector<8xf32> to vector<8x1xf32> - xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.barrier - %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> - %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> + %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, 
#xegpu.block_tdesc_attr> + %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> %339 = vector.shape_cast %338 : vector<8x4xf32> to vector<32xf32> %340 = vector.shuffle %339, %339 [0, 1, 2, 3] : vector<32xf32>, vector<32xf32> %341 = vector.shuffle %339, %339 [4, 5, 6, 7] : vector<32xf32>, vector<32xf32> @@ -499,8 +499,8 @@ module { } %36 = arith.addi %16, %18 : index %37 = arith.addi %36, %9 : index - %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VectorLinearize/postop_reduce_n.mlir b/test/Transforms/VectorLinearize/postop_reduce_n.mlir index 09f28d414..506eb36f4 100644 --- a/test/Transforms/VectorLinearize/postop_reduce_n.mlir +++ b/test/Transforms/VectorLinearize/postop_reduce_n.mlir @@ -56,13 +56,13 @@ module { %28 = arith.muli %27, %c32 : index %29 = arith.addi %19, %c0 : index %30 = arith.addi %15, %c0 : index - %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %32 = arith.divsi %23, %c32 : index %33 = arith.muli %32, %c32 : index %34 = arith.addi %26, %2 : index %35 = arith.addi %34, %c0 : index %36 = arith.addi %28, %c0 : index - %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %38 = arith.remsi %11, %c4 : index %39 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { @@ -89,31 +89,31 @@ module { %57 = arith.muli %56, %c128 : index %58 = arith.addi %55, %c0 : index %59 = arith.addi %23, %c0 : index - %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : 
memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %64 = arith.addi %57, %3 : index %65 = arith.addi %64, %c0 : index %66 = arith.addi %33, %c0 : index - %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { + %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, 
#xegpu.block_tdesc_attr> + %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %437 = arith.cmpi eq, %arg18, %c21 : index %438 = arith.select %437, %c0, %arg18 : index scf.if %437 { gpu.barrier } %439 = arith.addi %438, %c1 : index - %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x32x16xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 
463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -144,7 +144,7 @@ module { %448 = vector.extract_strided_slice %442 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %449 = vector.extract_strided_slice %442 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %450 = vector.extract_strided_slice %442 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> - %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> + %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x16x16x2xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 
478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -164,15 +164,15 @@ module { %456 = vector.extract_strided_slice %453 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %457 = vector.extract_strided_slice %453 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %462 = xegpu.dpas %443, %454, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %463 = xegpu.dpas %447, %455, %462 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -191,7 +191,7 @@ module { %476 = xegpu.dpas %446, %456, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %477 = xegpu.dpas %450, %457, %476 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index + scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, 
#xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index } //CHECK-COUNT-8: vector.shape_cast %{{.*}} : vector<8x16xf32> to vector<128xf32> @@ -489,27 +489,27 @@ module { %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class> %359 = arith.addi %13, %c0 : index %360 = arith.addi %38, %c0 : index - %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %362 = arith.addi %13, %c8 : index - %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %364 = arith.addi %13, %c16 : index - %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %366 = arith.addi %13, %c24 : index - %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %368 = vector.extract_strided_slice %358 {offsets = [0, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %369 = vector.extract_strided_slice %358 {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %370 = vector.extract_strided_slice %358 {offsets = [16, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %371 = vector.extract_strided_slice %358 {offsets = [24, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> - xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> 
+ xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.barrier %372 = arith.addi %9, %c0 : index - %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> - %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> + %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> + %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> %375 = vector.extract_strided_slice %374 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> %376 = vector.extract_strided_slice %374 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> %377 = vector.extract_strided_slice %374 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> @@ -591,8 +591,8 @@ module { %41 = arith.addi %40, %9 : index %42 = arith.addi %41, %c0 : index %43 = arith.addi %7, %c0 : index - %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VnniTransform/gemm_with_extract.mlir b/test/Transforms/VnniTransform/gemm_with_extract.mlir index a1685f920..206486397 100644 --- a/test/Transforms/VnniTransform/gemm_with_extract.mlir +++ b/test/Transforms/VnniTransform/gemm_with_extract.mlir @@ -11,28 +11,28 @@ gpu.module @test_kernel { %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : 
memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %10 = arith.addi %0, %c16 : index - %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %13 = arith.addi %0, %c24 : index - %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { + %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { %31 = vector.extract_strided_slice %arg6 
{offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %32 = vector.extract_strided_slice %arg6 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %33 = vector.extract_strided_slice %arg6 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -41,7 +41,7 @@ gpu.module @test_kernel { %36 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %37 = vector.extract_strided_slice %arg7 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %38 = vector.extract_strided_slice %arg7 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %40 = vector.extract %39[0] : vector<32x16xf16> from vector<2x32x16xf16> %41 = vector.extract %39[1] : vector<32x16xf16> from vector<2x32x16xf16> %42 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -53,8 +53,8 @@ gpu.module @test_kernel { %48 = vector.extract_strided_slice %41 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %49 = vector.extract_strided_slice %41 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> - //CHECK: %[[R50:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> - %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[R50:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[R51:.*]] = vector.extract %[[R50]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[R52:.*]] = vector.extract %[[R50]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -117,9 +117,9 @@ gpu.module @test_kernel { %80 = vector.shuffle %64, %68 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %81 = vector.shuffle %72, %76 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %82 = vector.shuffle %80, %81 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield 
%83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> } %23 = vector.extract_strided_slice %22#2 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %22#2 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -129,14 +129,14 @@ gpu.module @test_kernel { %28 = vector.extract_strided_slice %22#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %29 = vector.extract_strided_slice %22#3 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %30 = vector.extract_strided_slice %22#3 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir b/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir index e1c803622..af6696bc4 100644 --- a/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir +++ b/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir @@ -35,28 +35,28 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %10 = arith.addi %0, %c16 : index - %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %13 = arith.addi %0, %c24 : index - %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %18 = xegpu.load_nd %16 
<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { + %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { %31 = vector.extract_strided_slice %arg6 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %32 = vector.extract_strided_slice %arg6 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %33 = vector.extract_strided_slice %arg6 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -65,7 +65,7 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. 
%36 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %37 = vector.extract_strided_slice %arg7 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %38 = vector.extract_strided_slice %arg7 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %40 = vector.extract %39[0] : vector<32x16xf16> from vector<2x32x16xf16> %41 = vector.extract %39[1] : vector<32x16xf16> from vector<2x32x16xf16> %42 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -77,7 +77,7 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %48 = vector.extract_strided_slice %41 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %49 = vector.extract_strided_slice %41 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> - %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %51 = vector.extract %50[0] : vector<32x16xf16> from vector<2x32x16xf16> %52 = vector.extract %50[1] : vector<32x16xf16> from vector<2x32x16xf16> @@ -109,9 +109,9 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. 
%80 = vector.shuffle %64, %68 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %81 = vector.shuffle %72, %76 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %82 = vector.shuffle %80, %81 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> } %23 = vector.extract_strided_slice %22#2 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %22#2 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -121,14 +121,14 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %28 = vector.extract_strided_slice %22#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %29 = vector.extract_strided_slice %22#3 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %30 = vector.extract_strided_slice %22#3 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %23, %4 
<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } }