diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td b/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td index 49422f46c..a793b7081 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -22,30 +22,27 @@ def XeGPU_ScatteredAttr : XeGPUAttr<"Scattered", "scattered"> { def XeGPU_SgMapAttr: XeGPUAttr<"SubGroupMap", "sg_map"> { let parameters = (ins - ArrayRefParameter<"unsigned">:$wiLayout, - ArrayRefParameter<"unsigned">:$wiData, - ArrayRefParameter<"unsigned">:$mmaBlockSize); + OptionalParameter<"mlir::DenseI32ArrayAttr">:$mma_block_size, + "mlir::DenseI32ArrayAttr":$wi_layout, + "mlir::DenseI32ArrayAttr":$wi_data + ); // In format of #xegpu.sg_map<{mma_block_size = [2, 4], wi_layout = [2, 4], wi_data = [2, 4]}> - let assemblyFormat = "`<` custom($wiLayout, $wiData, $mmaBlockSize) `>`"; + let assemblyFormat = "`<` struct(params) `>`"; let genVerifyDecl = true; - let extraClassDeclaration = [{ - bool hasMMABlockSizeAttr() { - return getMmaBlockSize().size() == 2; - } - }]; - let builders = [ AttrBuilder<(ins - "::llvm::ArrayRef":$wiLayout, - "::llvm::ArrayRef":$wiData, - CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize + "::llvm::ArrayRef":$wiLayout, + "::llvm::ArrayRef":$wiData, + CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize ), [{ assert(wiLayout.size() == 2 && wiData.size() == 2 && "wiLayout and wiData should be 2D arrays.\n"); assert((mmaBlockSize.size() == 2 || mmaBlockSize.size() == 0) && "mmaBlockSize can be either empty or a 2D array.\n"); - return $_get($_ctxt, wiLayout, wiData, mmaBlockSize); + return $_get($_ctxt, mlir::DenseI32ArrayAttr::get($_ctxt, mmaBlockSize), + mlir::DenseI32ArrayAttr::get($_ctxt, wiLayout), + mlir::DenseI32ArrayAttr::get($_ctxt, wiData)); }]> ]; @@ -54,16 +51,18 @@ def XeGPU_SgMapAttr: XeGPUAttr<"SubGroupMap", "sg_map"> { def XeGPU_WgMapAttr: XeGPUAttr<"WorkGroupMap", "wg_map"> { let parameters = (ins - ArrayRefParameter<"unsigned">:$sgLayout, - ArrayRefParameter<"unsigned">:$sgData); + "mlir::DenseI32ArrayAttr":$sg_layout, + "mlir::DenseI32ArrayAttr":$sg_data + ); let builders = [ AttrBuilder<(ins - "::llvm::ArrayRef":$sgLayout, - "::llvm::ArrayRef":$sgData + "::llvm::ArrayRef":$sgLayout, + "::llvm::ArrayRef":$sgData ), [{ assert(sgLayout.size() == 2 && sgData.size() == 2 && "sgLayout and sgData should be 2D arrays.\n"); - return $_get($_ctxt, sgLayout, sgData); + return $_get($_ctxt, mlir::DenseI32ArrayAttr::get($_ctxt, sgLayout), + mlir::DenseI32ArrayAttr::get($_ctxt, sgData)); }]> ]; @@ -71,7 +70,7 @@ def XeGPU_WgMapAttr: XeGPUAttr<"WorkGroupMap", "wg_map"> { let skipDefaultBuilders = 1; // In format of #xegpu.wg_map<{sg_layout = [2, 4], sg_data = [2, 4]}> - let assemblyFormat = "`<` custom($sgLayout, $sgData) `>`"; + let assemblyFormat = "`<` struct(params) `>`"; } def XeGPU_XeMapAttr: XeGPUAttr<"XeMap", "xe_map"> { @@ -81,23 +80,27 @@ def XeGPU_XeMapAttr: XeGPUAttr<"XeMap", "xe_map"> { let builders = [ AttrBuilder<(ins - "::llvm::ArrayRef":$sgLayout, - "::llvm::ArrayRef":$sgData, - "::llvm::ArrayRef":$wiLayout, - "::llvm::ArrayRef":$wiData, - CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize + "::llvm::ArrayRef":$sgLayout, + "::llvm::ArrayRef":$sgData, + "::llvm::ArrayRef":$wiLayout, + "::llvm::ArrayRef":$wiData, + CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize ), [{ assert(sgLayout.size() == 2 && sgData.size() == 2 && "sgLayout and sgData should be 2D arrays.\n"); assert(wiLayout.size() == 2 && wiData.size() == 2 && "wiLayout and wiData should be 2D arrays.\n"); assert((mmaBlockSize.size() == 2 || 
mmaBlockSize.size() == 0) && "mmaBlockSize can be either empty or a 2D array.\n"); - auto wg = WorkGroupMapAttr::get($_ctxt, sgLayout, sgData); - auto sg = SubGroupMapAttr::get($_ctxt, wiLayout, wiData, mmaBlockSize); + auto wg = WorkGroupMapAttr::get($_ctxt, mlir::DenseI32ArrayAttr::get($_ctxt, sgLayout), + mlir::DenseI32ArrayAttr::get($_ctxt, sgData)); + auto sg = SubGroupMapAttr::get($_ctxt, mlir::DenseI32ArrayAttr::get($_ctxt, mmaBlockSize), + mlir::DenseI32ArrayAttr::get($_ctxt, wiLayout), + mlir::DenseI32ArrayAttr::get($_ctxt, wiData)); return $_get($_ctxt, wg, sg); }]> ]; // In format of #xegpu.xe_map - let hasCustomAssemblyFormat = 1; + let assemblyFormat = "`<` struct(params) `>`"; + } def XeGPU_ArgTypeAttr : I32EnumAttr< diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td index b2baf565c..652ee9f60 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td @@ -317,13 +317,12 @@ def XeGPU_CreateDescOp let description = [{ "create_tdesc" is similar to "create_nd_tdesc" in terms that it creates a TensorDesc for a memory region. while "create_nd_tdesc" is for creating continious subviews, "create_tdesc" is for creating non-continious - (scattered) subviews. It accepts the following parameters: + (scattered) subviews. It only works with VectorCompute (VC) mode and accepts the following parameters: * source: a 1D memref or pointer (uint64_t) represents the memory object. - * offsets: In VectorCompute (VC) mode, it is a 1D vector containing offsets of each access point, the size is aligned with + * offsets: It is a 1D vector containing offsets of each access point, the size should be aligned with supportted group size, e.g., vector<16xindex>. And each element in the vector corresponds to a work item (SIMT lane) in the subgroup. - In SIMT mode (default), it is an index scalar representing the offset of the access point. * chunk_size_per_lane: [optional attribute] indicates number of continious elements accessed for each offset, default is 1. Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] @@ -336,13 +335,6 @@ def XeGPU_CreateDescOp %0 = memref.alloc() : memref<1024xf32> %c0 = arith.constant dense<0, 16, 32, 64> : vector<4xindex> %1 = xegpu.create_tdesc %0, %c0 {chunk_size_per_lane = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> - - Example 3. an SIMT mode example, accessing a[16]. 
- %a = memref.alloc() : memref<1024xf32> - %c0 = arith.constant 16 : index - %1 = xegpu.create_tdesc %a, %c0: memref<1024xf32> -> TensorDesc<1xf32> - - }]; let arguments = (ins XeGPU_BaseAddrType: $source, @@ -366,6 +358,28 @@ def XeGPU_CreateDescOp }]; + let builders = [ + OpBuilder<(ins "::imex::xegpu::TensorDescType": $TensorDesc, "::mlir::Value": $source, + "::mlir::Value": $offsets, CArg<"uint32_t", "1"> : $chunk_size_per_lane), [{ + $_state.addOperands(source); + $_state.addOperands(offsets); + $_state.getOrAddProperties().chunk_size_per_lane = $_builder.getIntegerAttr($_builder.getIntegerType(32), chunk_size_per_lane); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(TensorDesc); + }]>, + + OpBuilder<(ins "::imex::xegpu::TensorDescType": $TensorDesc, "::mlir::Value": $source, + "::mlir::Value": $offsets, "::mlir::IntegerAttr": $chunk_size_per_lane), [{ + $_state.addOperands(source); + $_state.addOperands(offsets); + if(chunk_size_per_lane) + $_state.getOrAddProperties().chunk_size_per_lane = chunk_size_per_lane; + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(TensorDesc); + }]> + ]; + let skipDefaultBuilders = 1; + // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> let hasCustomAssemblyFormat = 1; @@ -431,6 +445,25 @@ def XeGPU_PrefetchNDOp : XeGPU_Op<"prefetch_nd", []> { let hasCustomAssemblyFormat = 1; } +def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> { + let summary = "update the offsets for the given tensor descriptor"; + + let arguments = (ins + XeGPU_TensorDesc: $TensorDesc, + Variadic: $offsets, + DefaultValuedAttr: $mode); + + let results = (outs XeGPU_TensorDesc: $result); + + let assemblyFormat = [{ + $TensorDesc `,` (`[` $offsets^ `]`)? (`{` `mode` `=` $mode^ `}`)? 
+ attr-dict `:` qualified(type($TensorDesc)) `->` qualified(type($result)) + }]; + + let hasVerifier = 1; +} + + def XeGPU_DpasOp : XeGPU_Op<"dpas"> { let summary = "performs dpas computation"; let arguments = (ins @@ -480,6 +513,55 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load"> { let results = (outs XeGPU_ValueType: $value); + let builders = [ + OpBuilder<(ins "::mlir::Type": $value, "::mlir::Value": $TensorDesc, "::mlir::Value": $mask, "::mlir::IntegerAttr": $vnni_axis, + CArg<"::mlir::DenseI64ArrayAttr", "::mlir::DenseI64ArrayAttr()">: $transpose, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l1_hint, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l2_hint, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l3_hint), [{ + $_state.addOperands(TensorDesc); + $_state.addOperands(mask); + if (vnni_axis) { + $_state.getOrAddProperties().vnni_axis = vnni_axis; + } + if (transpose) { + $_state.getOrAddProperties().transpose = transpose; + } + if (l1_hint) { + $_state.getOrAddProperties().l1_hint = l1_hint; + } + if (l2_hint) { + $_state.getOrAddProperties().l2_hint = l2_hint; + } + if (l3_hint) { + $_state.getOrAddProperties().l3_hint = l3_hint; + } + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(value); }]>, + + OpBuilder<(ins "::mlir::Type": $value, "::mlir::Value": $TensorDesc, "::mlir::Value": $mask, "::mlir::IntegerAttr": $vnni_axis, + CArg<"::mlir::DenseI64ArrayAttr", "::mlir::DenseI64ArrayAttr()">: $transpose, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l1_hint, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l2_hint, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l3_hint), [{ + $_state.addOperands(TensorDesc); + $_state.addOperands(mask); + if (vnni_axis) { + $_state.getOrAddProperties().vnni_axis = vnni_axis; + } + if (transpose) { + $_state.getOrAddProperties().transpose = transpose; + } + + $_state.getOrAddProperties().l1_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l1_hint); + $_state.getOrAddProperties().l2_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l2_hint); + $_state.getOrAddProperties().l3_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l3_hint); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(value); }]> + + ]; + let skipDefaultBuilders = 1; + // In format of: %2 = xegpu.load %1, %0 {transpose = [1, 0], l1_hint = cached, l2_hint = uncached} // : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32> let hasCustomAssemblyFormat = 1; @@ -499,30 +581,47 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", []> { DefaultValuedAttr: $mode ); + let builders = [ + OpBuilder<(ins "::mlir::Value": $value, "::mlir::Value": $TensorDesc, "::mlir::Value": $mask, + CArg<"::imex::xegpu::CacheWriteHintAttr", "::imex::xegpu::CacheWriteHintAttr()">: $l1_hint, + CArg<"::imex::xegpu::CacheWriteHintAttr", "::imex::xegpu::CacheWriteHintAttr()">: $l2_hint, + CArg<"::imex::xegpu::CacheWriteHintAttr", "::imex::xegpu::CacheWriteHintAttr()">: $l3_hint), [{ + $_state.addOperands(value); + $_state.addOperands(TensorDesc); + $_state.addOperands(mask); + if (l1_hint) { + $_state.getOrAddProperties().l1_hint = l1_hint; + } + if (l2_hint) { + 
$_state.getOrAddProperties().l2_hint = l2_hint; + } + if (l3_hint) { + $_state.getOrAddProperties().l3_hint = l3_hint; + } + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + }]>, + + OpBuilder<(ins "::mlir::Value": $value, "::mlir::Value": $TensorDesc, "::mlir::Value": $mask, + CArg<"::imex::xegpu::CacheWriteHint", "::imex::xegpu::CacheWriteHint::WRITE_BACK">: $l1_hint, + CArg<"::imex::xegpu::CacheWriteHint", "::imex::xegpu::CacheWriteHint::WRITE_BACK">: $l2_hint, + CArg<"::imex::xegpu::CacheWriteHint", "::imex::xegpu::CacheWriteHint::WRITE_BACK">: $l3_hint), [{ + $_state.addOperands(value); + $_state.addOperands(TensorDesc); + $_state.addOperands(mask); + $_state.getOrAddProperties().l1_hint = ::imex::xegpu::CacheWriteHintAttr::get($_builder.getContext(), l1_hint); + $_state.getOrAddProperties().l2_hint = ::imex::xegpu::CacheWriteHintAttr::get($_builder.getContext(), l2_hint); + $_state.getOrAddProperties().l3_hint = ::imex::xegpu::CacheWriteHintAttr::get($_builder.getContext(), l3_hint); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + }]> + ]; + let skipDefaultBuilders = 1; + // Format: %3 = xegpu.load %1, %0 {l1_hint = cached, l2_hint = uncached} // : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32> let hasCustomAssemblyFormat = 1; let hasVerifier = 1; }
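A minimal usage sketch of the two StoreScatterOp builders above, assuming a surrounding RewritePattern where `rewriter`, `loc`, `value`, `tdesc`, and `mask` are in scope (all names are illustrative placeholders, not code from this patch). Passing the first hint explicitly keeps the call unambiguous, since both overloads are otherwise callable with just the three SSA values:

  // Enum-based overload: the remaining hints default to WRITE_BACK and the
  // builder body above stamps mode = vc.
  rewriter.create<imex::xegpu::StoreScatterOp>(
      loc, value, tdesc, mask, imex::xegpu::CacheWriteHint::WRITE_BACK);

  // Attribute-based overload: hints left as null attributes are not set.
  auto uncached = imex::xegpu::CacheWriteHintAttr::get(
      rewriter.getContext(), imex::xegpu::CacheWriteHint::UNCACHED);
  rewriter.create<imex::xegpu::StoreScatterOp>(loc, value, tdesc, mask,
                                               /*l1_hint=*/uncached);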
-def XeGPU_UpdateNDOffsetOp : XeGPU_Op<"update_nd_offset", []> { - let summary = "update the offsets for the given tensor descriptor"; - - let arguments = (ins - XeGPU_TensorDesc: $TensorDesc, - Variadic: $offsets, - DefaultValuedAttr: $mode); - - let results = (outs XeGPU_TensorDesc: $result); - - let assemblyFormat = [{ - $TensorDesc `,` (`[` $offsets^ `]`)? (`{` `mode` `=` $mode^ `}`)? - attr-dict `:` qualified(type($TensorDesc)) `->` qualified(type($result)) - }]; - - let hasVerifier = 1; -} - def XeGPU_UpdateOffsetOp : XeGPU_Op<"update_offset", []> { let summary = "update the offsets for the given tensor descriptor"; @@ -535,6 +634,17 @@ def XeGPU_UpdateOffsetOp let results = (outs XeGPU_TensorDesc: $result); + let builders = [ + OpBuilder<(ins "::mlir::Type": $result, "::mlir::Value": $TensorDesc, "::mlir::Value": $offsets), [{ + $_state.addOperands(TensorDesc); + $_state.addOperands(offsets); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(result); + }]> + ]; + + let skipDefaultBuilders = 1; + let assemblyFormat = [{ $TensorDesc `,` $offsets (`{` `mode` `=` $mode^ `}`)? attr-dict `:` qualified(type($TensorDesc)) `,` qualified(type($offsets)) `->` qualified(type($result)) @@ -543,6 +653,53 @@ let hasVerifier = 1; } +def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> { + let summary = "prefetches an nD block to cache"; + let arguments = (ins XeGPU_TensorDesc: $TensorDesc, + OptionalAttr: $l1_hint, + OptionalAttr: $l2_hint, + OptionalAttr: $l3_hint, + DefaultValuedAttr: $mode + ); + + let builders = [ + OpBuilder<(ins "::mlir::Value": $TensorDesc, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l1_hint, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l2_hint, + CArg<"::imex::xegpu::CacheReadHintAttr", "::imex::xegpu::CacheReadHintAttr()">: $l3_hint), [{ + $_state.addOperands(TensorDesc); + if (l1_hint) { + $_state.getOrAddProperties().l1_hint = l1_hint; + } + if (l2_hint) { + $_state.getOrAddProperties().l2_hint = l2_hint; + } + if (l3_hint) { + $_state.getOrAddProperties().l3_hint = l3_hint; + } + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + }]>, + + OpBuilder<(ins "::mlir::Value": $TensorDesc, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l1_hint, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l2_hint, + CArg<"::imex::xegpu::CacheReadHint", "::imex::xegpu::CacheReadHint::CACHED">: $l3_hint), [{ + $_state.addOperands(TensorDesc); + $_state.getOrAddProperties().l1_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l1_hint); + $_state.getOrAddProperties().l2_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l2_hint); + $_state.getOrAddProperties().l3_hint = ::imex::xegpu::CacheReadHintAttr::get($_builder.getContext(), l3_hint); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + }]> + ]; + + let skipDefaultBuilders = 1; + + // In format of: xegpu.prefetch %tdesc {l1_hint = cached, l2_hint = uncached}: + // !xegpu.tensor_desc<8x16xf16> + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + def XeGPU_InvokeSIMDOp : XeGPU_Op<"invoke_SIMD", []> { let summary = "Invoke_SIMD operation"; let description = [{ @@ -588,6 +745,36 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", []> { let assemblyFormat = [{ $kind $tensorDesc `,` $mask (`,` $value^)? (`{` `mode` `=` $mode^ `}`)? 
attr-dict `:` qualified(type(operands)) `->` type($result) }]; + + let builders = [ + OpBuilder<(ins "::mlir::Type": $result, "::imex::xegpu::AtomicRMWKindAttr": $kind, + "::mlir::Value": $tensorDesc, "::mlir::Value": $mask, + "::mlir::Value": $value), [{ + $_state.addOperands(tensorDesc); + $_state.addOperands(mask); + if (value) + $_state.addOperands(value); + $_state.getOrAddProperties().kind = kind; + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(result); + }]>, + + OpBuilder<(ins "::mlir::Type": $result, "::imex::xegpu::AtomicRMWKind": $kind, + "::mlir::Value": $tensorDesc, "::mlir::Value": $mask, + "::mlir::Value": $value), [{ + $_state.addOperands(tensorDesc); + $_state.addOperands(mask); + if (value) + $_state.addOperands(value); + $_state.getOrAddProperties().kind = ::imex::xegpu::AtomicRMWKindAttr::get($_builder.getContext(), kind); + $_state.getOrAddProperties().mode = ::imex::xegpu::ModeAttr::get($_builder.getContext(), imex::xegpu::Mode::VC); + $_state.addTypes(result); + }]> + ]; + + let skipDefaultBuilders = 1; + + let hasVerifier = 1; } diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td b/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td index 6f6f3df59..c87238232 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td @@ -27,7 +27,8 @@ def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; def XeGPU_BaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1, 2]>, UI64, UI32, I64, I32]>; def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>; -def XeGPU_OffsetType: AnyTypeOf<[VectorOfRankAndType<[1], [Index]>, Index]>; +// def XeGPU_OffsetType: AnyTypeOf<[VectorOfRankAndType<[1], [Index]>, Index]>; +def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>; def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1,2], [I1]>, I1]>; def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3], [XeGPU_ScalarType]>, XeGPU_ScalarType]>; @@ -70,16 +71,19 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", let parameters = (ins ArrayRefParameter<"int64_t">:$shape, "::mlir::Type":$elementType, DefaultValuedParameter<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, - OptionalParameter<"::mlir::Attribute"> :$encoding); + OptionalParameter<"::mlir::Attribute">: $encoding, + OptionalParameter<"::mlir::Attribute">: $mapping + ); let builders = [ TypeBuilderWithInferredContext<(ins "::llvm::ArrayRef":$shape, "::mlir::Type":$elementType, CArg<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, - CArg<"::mlir::Attribute", "{}">:$encoding + CArg<"::mlir::Attribute", "{}">:$encoding, + CArg<"::mlir::Attribute", "{}">:$mapping ), [{ - return $_get(elementType.getContext(), shape, elementType, memory_scope, encoding); + return $_get(elementType.getContext(), shape, elementType, memory_scope, encoding, mapping); }]> ]; @@ -99,7 +103,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", } }]; - let assemblyFormat = "`<` custom($shape, $elementType)``custom($memory_scope, $encoding)`>`"; + let assemblyFormat = "`<` custom($shape, $elementType)``custom($memory_scope, $encoding, $mapping)`>`"; } diff --git a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp index b4e86fcc8..9b044065d 100644 --- a/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp +++ 
b/lib/Conversion/XeGPUToSPIRV/XeGPUToSPIRV.cpp @@ -1474,11 +1474,11 @@ Value linearizeOffset(OpBuilder builder, Location loc, unsigned getElementPerWI(imex::xegpu::TensorDescType tDescType) { imex::xegpu::SubGroupMapAttr sgMap; - auto encoding = tDescType.getEncoding(); - if (auto xeMapAttr = llvm::dyn_cast(encoding)) { + auto mapping = tDescType.getMapping(); + if (auto xeMapAttr = llvm::dyn_cast(mapping)) { sgMap = xeMapAttr.getSg(); } else { - sgMap = llvm::dyn_cast(encoding); + sgMap = llvm::dyn_cast(mapping); } auto blockSize = tDescType.getShape(); auto wiLayout = sgMap.getWiLayout(); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index 2fe695760..79e5ae9f1 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -40,7 +40,7 @@ class SgInitTileOpPattern llvm::SmallVector offsets; auto staticOffsets = op.getStaticOffsets(); auto dynamicOffsets = op.getOffsets(); - for (int i = 0, j = 0; i != staticOffsets.size(); i++) { + for (size_t i = 0, j = 0; i != staticOffsets.size(); i++) { if (mlir::ShapedType::isDynamic(staticOffsets[i])) { offsets.push_back(dynamicOffsets[j++]); } else { @@ -77,8 +77,6 @@ class SgInitTileOpPattern mlir::SmallVector tDescOffsets{tDescOffsetX, tDescOffsetY}; - constexpr int64_t kDynamic = std::numeric_limits::min(); - // TODO: this needs improvement, it assumes the source is static // memeref. auto createNdOp = rewriter.create( @@ -116,9 +114,6 @@ struct SgPrefetchTileOpPattern return mlir::failure(); } - auto elementTy = tileTy.getElementType(); - auto subVectorTy = mlir::VectorType::get({shape[2], shape[3]}, elementTy); - auto L1 = xegpu::CacheReadHintAttr::get(op.getContext(), xegpu::CacheReadHint::CACHED); auto L2 = xegpu::CacheReadHintAttr::get(op.getContext(), @@ -129,9 +124,8 @@ struct SgPrefetchTileOpPattern for (int i = 0; i < shape[0]; i++) { for (int j = 0; j < shape[1]; j++) { auto tile = tiles[i * shape[1] + j]; - rewriter.create( - op.getLoc(), subVectorTy, tile, mlir::IntegerAttr(), - mlir::DenseI64ArrayAttr(), L1, L2, L3, imex::xegpu::Mode::VC); + rewriter.create(op.getLoc(), tile, L1, L2, L3, + imex::xegpu::Mode::VC); } } @@ -175,11 +169,11 @@ struct SgLoadTileOpPattern mlir::IntegerAttr vnniAxisAttr; auto transposeAttr = op.getTransposeAttr(); auto L1 = xegpu::CacheReadHintAttr::get(op.getContext(), - xegpu::CacheReadHint::UNCACHED); + xegpu::CacheReadHint::CACHED); auto L2 = xegpu::CacheReadHintAttr::get(op.getContext(), - xegpu::CacheReadHint::UNCACHED); + xegpu::CacheReadHint::CACHED); auto L3 = xegpu::CacheReadHintAttr::get(op.getContext(), - xegpu::CacheReadHint::UNCACHED); + xegpu::CacheReadHint::CACHED); llvm::SmallVector newShape = {shape[2], shape[3]}; // needs vnni transform; @@ -235,11 +229,11 @@ struct SgStoreTileOpPattern auto context = op.getContext(); auto L1 = xegpu::CacheWriteHintAttr::get(context, - xegpu::CacheWriteHint::UNCACHED); + xegpu::CacheWriteHint::WRITE_BACK); auto L2 = xegpu::CacheWriteHintAttr::get(context, - xegpu::CacheWriteHint::UNCACHED); + xegpu::CacheWriteHint::WRITE_BACK); auto L3 = xegpu::CacheWriteHintAttr::get(context, - xegpu::CacheWriteHint::UNCACHED); + xegpu::CacheWriteHint::WRITE_BACK); for (size_t i = 0; i < tiles.size(); i++) rewriter.create(op.getLoc(), tiles[i], values[i], L1, L2, L3, imex::xegpu::Mode::VC); diff --git a/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 4453f1607..43567fc66 100644 --- 
a/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -47,6 +47,13 @@ void XeGPUDialect::initialize() { >(); } +bool printDefaultValues() { + auto *env = getenv("IMEX_XEGPU_PRINT_DEFAULTS"); + if (env && std::string(env) == "true") + return true; + return false; +} + // custom parser for XeGPU_TensorDesc (shape and type parameter) static mlir::LogicalResult parseShapeAndType(mlir::AsmParser &parser, llvm::SmallVector &shape, @@ -77,9 +84,11 @@ static void printShapeAndType(mlir::AsmPrinter &printer, printer << type; } +// custom parser for XeGPU_TensorDesc (scope, encoding and mapping parameter) static mlir::LogicalResult parseTensorDescAttr(mlir::AsmParser &parser, imex::xegpu::MemoryScope &scope, - mlir::Attribute &encoding) { + mlir::Attribute &encoding, + mlir::Attribute &mapping) { // implies no attrbutes if (mlir::failed(parser.parseOptionalComma())) return mlir::success(); @@ -107,7 +116,14 @@ static mlir::LogicalResult parseTensorDescAttr(mlir::AsmParser &parser, return parser.emitError( loc, "Failed to parse XeGPU_TensorDesc parameter 'encoding' which " "is to be a `::mlir::Attribute`.\n"); - encoding = *attrOptional; + + if (llvm::isa(*attrOptional)) + encoding = *attrOptional; + + if (llvm::isa(*attrOptional) || + llvm::isa(*attrOptional) || + llvm::isa(*attrOptional)) + mapping = *attrOptional; return mlir::success(); } }; @@ -118,274 +134,79 @@ static mlir::LogicalResult parseTensorDescAttr(mlir::AsmParser &parser, return mlir::success(); } +// custom printer for XeGPU_TensorDesc (scope, encoding and mapping parameter) static void printTensorDescAttr(mlir::AsmPrinter &printer, imex::xegpu::MemoryScope scope, - mlir::Attribute encoding) { - if (scope != imex::xegpu::MemoryScope::GLOBAL) + mlir::Attribute encoding, + mlir::Attribute mapping) { + if (printDefaultValues() || scope != imex::xegpu::MemoryScope::GLOBAL) printer << ", memory_scope = " << scope; if (encoding) printer << ", " << encoding; -} - -template -static mlir::LogicalResult parseArrayList(mlir::AsmParser &parser, - llvm::SmallVector &array, - bool parsePrecedenceEqual = false) { - mlir::FailureOr> result; - // Parse literal '=' - if (parsePrecedenceEqual) - if (parser.parseEqual()) - return mlir::failure(); - - // Parse literal '[' - if (parser.parseLSquare()) - return mlir::failure(); - - result = mlir::FieldParser<::llvm::SmallVector>::parse(parser); - - if (::mlir::failed(result)) - return mlir::failure(); - - // Parse literal ']' - if (parser.parseRSquare()) - return mlir::failure(); - - array = result.value(); - return mlir::success(); -} - -template -static void printArrayElement(mlir::AsmPrinter &printer, - llvm::StringRef keyword, - llvm::ArrayRef array) { - printer << keyword; - printer << ' ' << "="; - printer << ' ' << "["; - printer.printStrippedAttrOrType(array); - printer << "]"; -} - -static mlir::LogicalResult -parseSubGroupMapAttrElements(mlir::AsmParser &parser, - llvm::SmallVector &layout, - llvm::SmallVector &data, - llvm::SmallVector &mmaBlockSize) { - auto parseElt = [&]() -> mlir::LogicalResult { - return mlir::AsmParser::KeywordSwitch(parser) - .Case("mma_block_size", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, mmaBlockSize, true); - }) - .Case("wi_layout", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, layout, true); - }) - .Case("wi_data", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, data, true); - }) - .Default([&](llvm::StringRef keyword, llvm::SMLoc) { - parser.emitError( - 
parser.getCurrentLocation(), - "SubGroupMapAttr Parser meet an unexpected keywoard: ") - << keyword << "\n"; - return mlir::failure(); - }); - }; - - if (parser.parseLBrace()) - return mlir::failure(); - if (parser.parseCommaSeparatedList(parseElt)) - return mlir::failure(); - if (parser.parseRBrace()) - return mlir::failure(); - - return mlir::success(); -} - -static void printSubGroupMapAttrElements( - mlir::AsmPrinter &printer, llvm::ArrayRef layout, - llvm::ArrayRef data, llvm::ArrayRef mmaBlockSize) { - printer << "{"; - if (mmaBlockSize.size()) { - printArrayElement(printer, "mma_block_size", mmaBlockSize); - printer << "," << ' '; - } - printArrayElement(printer, "wi_layout", layout); - printer << "," << ' '; - printArrayElement(printer, "wi_data", data); - printer << "}"; -} - -static mlir::LogicalResult -parseWorkGroupMapAttrElements(mlir::AsmParser &parser, - llvm::SmallVector &layout, - llvm::SmallVector &data) { - auto parseElt = [&]() -> mlir::LogicalResult { - return mlir::AsmParser::KeywordSwitch(parser) - .Case("sg_layout", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, layout, true); - }) - .Case("sg_data", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, data, true); - }) - .Default([&](llvm::StringRef keyword, llvm::SMLoc) { - parser.emitError( - parser.getCurrentLocation(), - "WorkGroupMapAttr Parser meet an unexpected keywoard: ") - << keyword << "\n"; - return mlir::failure(); - }); - }; - - if (parser.parseLBrace()) - return mlir::failure(); - if (parser.parseCommaSeparatedList(parseElt)) - return mlir::failure(); - if (parser.parseRBrace()) - return mlir::failure(); - return mlir::success(); -} - -static void printWorkGroupMapAttrElements(mlir::AsmPrinter &printer, - llvm::ArrayRef layout, - llvm::ArrayRef data) { - printer << "{"; - printArrayElement(printer, "sg_layout", layout); - printer << "," << ' '; - printArrayElement(printer, "sg_data", data); - printer << "}"; + if (mapping) + printer << ", " << mapping; } mlir::LogicalResult SubGroupMapAttr::verify( llvm::function_ref emitError, - llvm::ArrayRef layout, llvm::ArrayRef data, - llvm::ArrayRef mmaBlockSize) { - - if (mmaBlockSize.size() != 2 && mmaBlockSize.size() != 0) { - emitError() - << "Failed to parse SubGroupMapAttr: mma_block_size should be a " - "`llvm::ArrayRef` with size 2 or empty. But it got " - << mmaBlockSize.size() << ".\n"; - return mlir::failure(); - } + mlir::DenseI32ArrayAttr mmaBlockSize, mlir::DenseI32ArrayAttr layout, + mlir::DenseI32ArrayAttr data) { if (layout.size() != 2) { emitError() << "Failed to parse SubGroupMapAttr: missing wi_layout which " - "is to be a `llvm::ArrayRef` with size 2.\n"; + "is to be an integer array of size 2.\n"; return mlir::failure(); } if (data.size() != 2) { emitError() << "Failed to parse SubGroupMapAttr: missing wi_data which is " - "to be a `llvm::ArrayRef` with size 2.\n"; + "to be an integer array of size 2.\n"; return mlir::failure(); } + if (mmaBlockSize) { + if (mmaBlockSize.size() != 2) { + emitError() + << "Failed to parse SubGroupMapAttr: the optional mma_block_size " + "should be an integer array of size 2 or empty. But it got " + << mmaBlockSize.size() << ".\n"; + return mlir::failure(); + } + for (int i = 0; i < mmaBlockSize.size(); i++) { + if ((mmaBlockSize[i] % (layout[i] * data[i]) != 0 && + (layout[i] * data[i]) % mmaBlockSize[i] != 0) || + mmaBlockSize[i] % layout[i] != 0 || mmaBlockSize[i] % data[i] != 0) { + return emitError() + << "Invalid SubGroupMapAttr. 
A valid SubGroupMapAttr should " + "meet the following conditions: " + "\n\tmmaBlockSize[i] % wi_layout[i] == 0 && " + "\n\tmmaBlockSize[i] % wi_data[i] == 0 && " + "\n\t(mmaBlockSize[i] % (wi_layout[i] * wi_data[i]) == 0 || " + "\n\t (wi_layout[i] * wi_data[i]) % mmaBlockSize[i] == 0)"; + } + } + } + return mlir::success(); } mlir::LogicalResult WorkGroupMapAttr::verify( llvm::function_ref emitError, - llvm::ArrayRef layout, llvm::ArrayRef data) { + mlir::DenseI32ArrayAttr layout, mlir::DenseI32ArrayAttr data) { if (layout.size() != 2) { emitError() << "Failed to parse WorkGroupMapAttr: missing sg_layout which " - "is to be a `llvm::ArrayRef` with size 2.\n"; + "is to be a `llvm::ArrayRef` with size 2.\n"; return mlir::failure(); } if (data.size() != 2) { emitError() << "Failed to parse WorkGroupMapAttr: missing sg_data which is " - "to be a `llvm::ArrayRef` with size 2.\n"; + "to be a `llvm::ArrayRef` with size 2.\n"; return mlir::failure(); } return mlir::success(); } -mlir::Attribute XeMapAttr::parse(mlir::AsmParser &parser, mlir::Type type) { - imex::xegpu::WorkGroupMapAttr wg; - imex::xegpu::SubGroupMapAttr sg; - // Parse literal '<' - if (parser.parseLess()) - return {}; - - auto parseElt = [&]() -> mlir::ParseResult { - mlir::OptionalParseResult result = - mlir::AsmParser::KeywordSwitch(parser) - .Case("sg", - [&](llvm::StringRef, llvm::SMLoc) { - if (parser.parseEqual()) - return mlir::failure(); - llvm::SmallVector mmaBlockSize; - llvm::SmallVector wiLayout; - llvm::SmallVector wiData; - if (mlir::failed(parseSubGroupMapAttrElements( - parser, wiLayout, wiData, mmaBlockSize))) - return mlir::failure(); - sg = imex::xegpu::SubGroupMapAttr::get( - parser.getContext(), wiLayout, wiData, mmaBlockSize); - return mlir::success(!!sg); - }) - .Case("wg", - [&](llvm::StringRef, llvm::SMLoc) { - if (parser.parseEqual()) - return mlir::failure(); - llvm::SmallVector sgLayout; - llvm::SmallVector sgData; - if (mlir::failed(parseWorkGroupMapAttrElements( - parser, sgLayout, sgData))) - return mlir::failure(); - wg = imex::xegpu::WorkGroupMapAttr::get(parser.getContext(), - sgLayout, sgData); - return mlir::success(!!wg); - }) - .Default([&](llvm::StringRef keyword, llvm::SMLoc) { - return std::nullopt; - }); - return result.value(); - }; - - // Parse wg and sg attrs - if (parser.parseCommaSeparatedList(parseElt)) - return {}; - - // Parse literal '>' - if (parser.parseGreater()) - return {}; - - if (!wg && !sg) { - parser.emitError(parser.getCurrentLocation(), - "Expecting at least one of sg and wg attributes.\n"); - return {}; - } - - return XeMapAttr::get(parser.getContext(), wg, sg); -} - -void XeMapAttr::print(mlir::AsmPrinter &printer) const { - bool printSep = false; - printer << "<"; - if (getWg()) { - printer << "wg = "; - printWorkGroupMapAttrElements(printer, getWg().getSgLayout(), - getWg().getSgData()); - printSep = true; - } - - if (getSg()) { - if (printSep) - printer << ", "; - printer << "sg = "; - printSubGroupMapAttrElements(printer, getSg().getWiLayout(), - getSg().getWiData(), - getSg().getMmaBlockSize()); - } - - printer << ">"; -} - } // namespace xegpu } // namespace imex diff --git a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 225aae701..6756f3bc9 100644 --- a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -70,12 +70,6 @@ static void transpose(llvm::ArrayRef trans, shape[i] = old[trans[i]]; }; -static bool isMappingAttr(mlir::Attribute attr) { - return attr && (llvm::isa(attr) || - llvm::isa(attr) || - 
llvm::isa(attr)); -} - bool dpasSupportedTypes(mlir::Type type, bool isResult) { if (isResult) { if (type.isF32() || type.isInteger(32)) @@ -114,10 +108,12 @@ bool dpasSupportedTypes(mlir::Type type, bool isResult) { // return false; // } +extern bool printDefaultValues(); + template -static mlir::ParseResult parseCustomEnumAttr(mlir::OpAsmParser &parser, - mlir::OperationState &result, - llvm::StringRef attrKeyword) { +static ::mlir::ParseResult parseCustomEnumAttr(mlir::OpAsmParser &parser, + mlir::OperationState &result, + llvm::StringRef attrKeyword) { auto loc = parser.getCurrentLocation(); auto attrOptional = mlir::FieldParser::parse(parser); if (mlir::failed(attrOptional)) @@ -129,9 +125,9 @@ static mlir::ParseResult parseCustomEnumAttr(mlir::OpAsmParser &parser, } template -static mlir::ParseResult parseBoolAndIntegerAttr(mlir::OpAsmParser &parser, - mlir::OperationState &result, - llvm::StringRef attrKeyword) { +static ::mlir::ParseResult +parseBoolAndIntegerAttr(mlir::OpAsmParser &parser, mlir::OperationState &result, + llvm::StringRef attrKeyword) { AttrType attr; mlir::Type ty; @@ -160,7 +156,7 @@ static mlir::ParseResult parseBoolAndIntegerAttr(mlir::OpAsmParser &parser, /// @param result /// @param allowedKeywords /// @return -static mlir::ParseResult +static ::mlir::ParseResult parseOptionalAttrDict(mlir::OpAsmParser &parser, mlir::OperationState &result, llvm::ArrayRef allowedKeywords, bool isWrite = false) { @@ -234,8 +230,55 @@ static void printCacheHintAttrs(mlir::OpAsmPrinter &printer, T op, } } -mlir::ParseResult CreateNdDescOp::parse(mlir::OpAsmParser &parser, - mlir::OperationState &result) { +static bool verifyAndInferShape(std::vector &shape, + imex::xegpu::WorkGroupMapAttr wgMap, + imex::xegpu::SubGroupMapAttr sgMap) { + if (wgMap) { + auto sgData = wgMap.getSgData(); + auto sgLayout = wgMap.getSgLayout(); + + if (shape.size() != sgData.size() || shape.size() != sgLayout.size()) + return false; + + for (size_t i = 0; i < shape.size(); i++) { + if (shape[i] % sgLayout[i] != 0 || shape[i] % sgData[i] != 0 || + (shape[i] % (sgLayout[i] * sgData[i]) != 0 && + (sgLayout[i] * sgData[i]) % shape[i] != 0)) + return false; + shape[i] /= sgLayout[i]; + } + } + + if (sgMap) { + auto blockSize = sgMap.getMmaBlockSize(); + auto wiLayout = sgMap.getWiLayout(); + auto wiData = sgMap.getWiData(); + + if (blockSize && shape.size() != blockSize.size()) { + return false; + } + + if (shape.size() != wiData.size() || shape.size() != wiLayout.size()) { + return false; + } + + for (size_t i = 0; i < shape.size(); i++) { + + if ((shape[i] % (wiLayout[i] * wiData[i]) != 0 && + (wiLayout[i] * wiData[i]) % shape[i] != 0) || + (blockSize && shape[i] % blockSize[i] != 0) || + shape[i] % wiLayout[i] != 0 || shape[i] % wiData[i] != 0) { + return false; + } + shape[i] /= wiLayout[i]; + } + } + + return true; +} + +::mlir::ParseResult CreateNdDescOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { // parse the source operand mlir::OpAsmParser::UnresolvedOperand sourceRawOperands[1]; @@ -325,6 +368,11 @@ mlir::ParseResult CreateNdDescOp::parse(mlir::OpAsmParser &parser, } void CreateNdDescOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto check = getBoundaryCheck(); + auto printDefaults = printDefaultValues(); + printer << ' '; printer << getSource(); printDynamicIndexList(printer, *this, getOffsets(), getStaticOffsetsAttr()); @@ -342,11 +390,24 @@ void CreateNdDescOp::print(::mlir::OpAsmPrinter &printer) { printer << "]"; } - 
printer << ' ' << "{"; - printer << "mode = " << getMode(); - printer << "," << ' '; - printer << "boundary_check = " << getBoundaryCheck(); - printer << "}"; + if (printDefaults || mode != imex::xegpu::Mode::SIMT || !check) { + printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { + printer << "mode = " << mode; + printSep = true; + } + + if (printDefaults || !check) { + if (printSep) + printer << "," << ' '; + printer << "boundary_check = " << check; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || !check) { + printer << "}"; + } printer << ' ' << ":"; printer << ' '; @@ -356,11 +417,22 @@ void CreateNdDescOp::print(::mlir::OpAsmPrinter &printer) { printer << getTensorDesc().getType(); } -mlir::LogicalResult CreateNdDescOp::verify() { +::mlir::LogicalResult CreateNdDescOp::verify() { auto mode = getMode(); auto encoding = getTensorDesc().getType().getEncoding(); + auto mapping = getTensorDesc().getType().getMapping(); - if (mode == imex::xegpu::Mode::SIMT && !isMappingAttr(encoding)) { + if (encoding) { + return emitOpError("Encoding Attribute of TensorDesc is not expected for " + "non-scattered operators.\n"); + } + + if (mode == imex::xegpu::Mode::VC && mapping) { + return emitOpError("Mapping attribute of TensorDesc is not expected " + "for VC mode operations.\n"); + } + + if (mode == imex::xegpu::Mode::SIMT && !mapping) { return emitOpError("Expecting either SgMap, WgMap or XeMap attribute for " "SIMT mode operators.\n"); } @@ -379,8 +451,8 @@ mlir::LogicalResult CreateNdDescOp::verify() { return mlir::success(); } -mlir::ParseResult CreateDescOp::parse(mlir::OpAsmParser &parser, - mlir::OperationState &result) { +::mlir::ParseResult CreateDescOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { mlir::OpAsmParser::UnresolvedOperand sourceRawOperands[1]; llvm::ArrayRef sourceOperands( sourceRawOperands); @@ -434,17 +506,35 @@ mlir::ParseResult CreateDescOp::parse(mlir::OpAsmParser &parser, } void CreateDescOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto chunk = getChunkSizePerLane(); + auto printDefaults = printDefaultValues(); + printer << ' '; printer << getSource(); printer << ","; printer << ' '; printer << getOffsets(); - printer << ' ' << "{"; - printer << "mode = " << getMode(); - printer << "," << ' '; - printer << "chunk_size_per_lane = " << getChunkSizePerLane(); - printer << "}"; + if (printDefaults || mode != imex::xegpu::Mode::SIMT || chunk != 1) { + printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { + printer << "mode = " << mode; + printSep = true; + } + + if (printDefaults || chunk != 1) { + if (printSep) + printer << "," << ' '; + printer << "chunk_size_per_lane = " << chunk; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || chunk != 1) { + printer << "}"; + } printer << ' ' << ":"; printer << ' '; @@ -457,45 +547,49 @@ void CreateDescOp::print(::mlir::OpAsmPrinter &printer) { printer << getTensorDesc().getType(); } -mlir::LogicalResult CreateDescOp::verify() { - if (getRankOf(getSource()) > 2) - return emitOpError( - "Expecting the source is a 2D/1D memref or pointer (uint64_t)."); - - std::vector shape; - +::mlir::LogicalResult CreateDescOp::verify() { + auto mode = getMode(); + auto mapping = getTensorDesc().getType().getMapping(); auto offsetTy = getOffsets().getType(); auto tdescTy = getTensorDesc().getType(); auto chunkSize = getChunkSizePerLane(); - auto tdescShape = tdescTy.getShape(); + if 
(mode == imex::xegpu::Mode::SIMT || mapping) { + return emitOpError("CreateDescOp only supports VC mode, and the mapping " + "attribute of TensorDesc is not expected.\n"); + } + + if (getRankOf(getSource()) > 2) + return emitOpError( + "Expecting the source to be a 1D/2D memref or pointer (uint64_t)."); + if (!tdescTy.getEncoding()) + return emitOpError( + "Expecting the presence of scattered attribute for tensor descriptor."); + + // Infer the TensorDesc shape + std::vector shape; if (llvm::isa(offsetTy)) { shape = llvm::dyn_cast(offsetTy).getShape().vec(); - if (shape.size() > 2) - return emitOpError( - "Expecting the offset is either a 2D/1D vector (for VC) " - "or scalar (for SIMT)."); + if (shape.size() != 1) + return emitOpError("Expecting the offset to be a 1D vector."); } - if (offsetTy.isIndex() || chunkSize != 1) { + if (chunkSize != 1) { shape.push_back(chunkSize); } + auto tdescShape = tdescTy.getShape(); if (shape != tdescShape.vec()) { return emitOpError("Expecting dimensions of offsets is the same as the " "tensor descriptor, or one less than."); } - if (!tdescTy.getEncoding()) - return emitOpError( - "Expecting the presence of scattered attribute for tensor descriptor."); - return mlir::success(); }
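A hedged end-to-end sketch of IR that the verifier above accepts, assuming `builder`, `loc`, `src`, and a `vector<16xindex>` value `offsets` are in scope (names are illustrative). With chunk_size_per_lane = 8 the descriptor gains a trailing dimension of 8, so the expected shape is 16x8, and the scattered encoding is mandatory:

  auto tdescTy = imex::xegpu::TensorDescType::get(
      {16, 8}, builder.getF32Type(), imex::xegpu::MemoryScope::GLOBAL,
      imex::xegpu::ScatteredAttr::get(builder.getContext()));
  // Uses the uint32_t chunk_size_per_lane builder declared in XeGPUOps.td;
  // it stamps mode = vc on the op.
  builder.create<imex::xegpu::CreateDescOp>(loc, tdescTy, src, offsets,
                                            /*chunk_size_per_lane=*/8u);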
-mlir::ParseResult LoadNDOp::parse(::mlir::OpAsmParser &parser, - ::mlir::OperationState &result) { +::mlir::ParseResult LoadNDOp::parse(::mlir::OpAsmParser &parser, + ::mlir::OperationState &result) { mlir::OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; llvm::ArrayRef<::mlir::OpAsmParser::UnresolvedOperand> TensorDescOperands( TensorDescRawOperands); @@ -533,27 +627,44 @@ mlir::ParseResult LoadNDOp::parse(::mlir::OpAsmParser &parser, } void LoadNDOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getTensorDesc(); - if ((*this)->getAttrs().size()) { + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; - printer << "mode = " << getMode(); - if (getVnniAxisAttr()) { + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { + printer << "mode = " << mode; + printSep = true; + } + + if (getVnniAxisAttr()) { + if (printSep) printer << "," << ' '; - printer << "vnni_axis = " << getVnniAxis().value(); - } + printer << "vnni_axis = " << getVnniAxis().value(); + printSep = true; + } - if (getTransposeAttr()) { + if (getTransposeAttr()) { + if (printSep) printer << "," << ' '; - printer << "transpose = "; - getTransposeAttr().print(printer); - } + printer << "transpose = "; + getTransposeAttr().print(printer); + printSep = true; + } - printCacheHintAttrs(printer, *this, true); + printCacheHintAttrs(printer, *this, printSep); + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } + printer << ' ' << ":"; printer << ' '; printer << getTensorDesc().getType(); @@ -562,15 +673,7 @@ void LoadNDOp::print(::mlir::OpAsmPrinter &printer) { printer << getValue().getType(); } -// mlir::LogicalResult CreateNbarrierOp::verify() { -// llvm::dbgs() << "\nOp: " << getValueAsString(*this) -// << "\n\tnum producers: " << getNumProducers() -// << "\n\tnum consumers: " << getNumConsumers() -// << "\n\n"; -// return mlir::success(); -// }
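The printers in this file now elide `mode = simt` and other default attributes unless printDefaultValues() returns true. A sketch of opting back into the verbose form from a C++ driver, assuming an `op` and the usual LLVM headers are in scope (setenv is POSIX; a lit test would instead prefix its RUN line with `env IMEX_XEGPU_PRINT_DEFAULTS=true`):

  #include <cstdlib>
  // ...
  setenv("IMEX_XEGPU_PRINT_DEFAULTS", "true", /*overwrite=*/1);
  op->print(llvm::outs()); // defaults such as mode = simt are printed again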
-mlir::LogicalResult LoadNDOp::verify() { +::mlir::LogicalResult LoadNDOp::verify() { auto tdescTy = getTensorDesc().getType(); auto valueTy = llvm::dyn_cast(getValue().getType()); @@ -588,9 +691,9 @@ mlir::LogicalResult LoadNDOp::verify() { return emitOpError( "Value should have the same element type as TensorDesc."); - if (tdescTy.getRank() == 2) { // TODO: The following logic are architecture - // dependent, pending to be moved - // out + if (tdescTy.getRank() == 2) { + // TODO: The following logic is architecture + // dependent, pending to be moved out auto width = tdescTy.getShape()[1]; auto height = tdescTy.getShape()[0]; auto elemTyByteWidth = tdescElemTy.getIntOrFloatBitWidth() / 8; @@ -598,10 +701,11 @@ if (width < MIN_2D_BLOCK_WIDTH_IN_ELEMENTS || width > MAX_2D_BLOCK_WIDTH_IN_ELEMENTS || (width * elemTyByteWidth) % 4 != 0) { - return emitOpError("Invalid width size for 2D block load. \ The specification expects the value to \ be in range [1, 64], and The the total \ data size (width * elemTyBytes) to be multiple of 4.\n"); + return emitOpError( + "Invalid width size for 2D block load. " + "The specification expects the value to " + "be in range [1, 64], and the total " + "data size (width * elemTyBytes) to be a multiple of 4.\n"); } if (height < MIN_2D_BLOCK_HEIGHT_IN_ELEMENTS || @@ -620,63 +724,36 @@ mlir::LogicalResult LoadNDOp::verify() { imex::xegpu::WorkGroupMapAttr wgMap; imex::xegpu::SubGroupMapAttr sgMap; - auto encoding = tdescTy.getEncoding(); - if (!isMappingAttr(encoding)) { + auto mapping = tdescTy.getMapping(); + if (!mapping) { return emitOpError("Expecting either SgMap, WgMap or XeMap attribute for " "SIMT mode operators.\n"); } - if (auto xeMapAttr = llvm::dyn_cast(encoding)) { + if (auto xeMapAttr = llvm::dyn_cast(mapping)) { wgMap = xeMapAttr.getWg(); sgMap = xeMapAttr.getSg(); } else { - wgMap = llvm::dyn_cast(encoding); - sgMap = llvm::dyn_cast(encoding); + wgMap = llvm::dyn_cast(mapping); + sgMap = llvm::dyn_cast(mapping); } - if (wgMap) { - auto sgData = wgMap.getSgData(); - auto sgLayout = wgMap.getSgLayout(); - for (size_t i = 0; i < sgData.size(); i++) { - if (tdescShape[i] % sgLayout[i] != 0 || - tdescShape[i] % sgData[i] != 0 || - tdescShape[i] % (sgLayout[i] * sgData[i]) != 0) - return emitOpError("Invalid WorkGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % sgLayout[i] == 0 && " - "tdescShape[i] % sgData[i] == 0 && " - "tdescShape[i] % (sgLayout[i] *sgData[i]) == 0"); - tdescShape[i] /= sgLayout[i]; - } - } - - if (sgMap) { - auto blockSize = sgMap.getMmaBlockSize(); - auto wiLayout = sgMap.getWiLayout(); - auto wiData = sgMap.getWiData(); - for (size_t i = 0; i < blockSize.size(); i++) { - if (tdescShape[i] % blockSize[i] != 0 || - blockSize[i] % wiLayout[i] != 0 || blockSize[i] % wiData[i] != 0 || - blockSize[i] % (wiLayout[i] * wiData[i]) != 0) { - return emitOpError("Invalid SubGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % blockSize[i] == 0 && " - "blockSize[i] % wiLayout[i] == 0 && " - "blockSize[i] % wiData[i] == 0 && " - "blockSize[i] % (wiLayout[i] * wiData[i]) == 0 "); - } - } - - for (size_t i = 0; i < wiLayout.size(); i++) { - if (tdescShape[i] % wiData[i] != 0 || - tdescShape[i] % (wiLayout[i] * wiData[i]) != 0) { - return emitOpError("Invalid SubGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % wiData[i] == 0 && " - "tdescShape[i] % (wiLayout[i] * wiData[i]) == 0 "); - } - tdescShape[i] /= wiLayout[i]; - } + if (!verifyAndInferShape(tdescShape, wgMap, sgMap)) { + return emitOpError("Failed to infer the shape.") + << "\nIt should meet the following conditions for " "WorkGroupMapAttr: " + << "\n\t tdescShape[i] % sg_layout[i] == 0 && " + << "\n\t tdescShape[i] % sg_data[i] == 0 && " + << "\n\t (tdescShape[i] % (sg_layout[i] * sg_data[i]) == 0 ||" + << "\n\t (sg_layout[i] * sg_data[i]) % tdescShape[i] == 0)" + << "\n\nAnd after performing shape[i] /= sg_layout[i], " + << "the new shape[i] should meet the following conditions " "for SubGroupMapAttr: " + << "\n\ttdescShape[i] % mma_block_size[i] == 0 (if it has) && " + << "\n\ttdescShape[i] % wi_layout[i] == 0 && " + << "\n\ttdescShape[i] % wi_data[i] == 0 && " + << "\n\t(tdescShape[i] % (wi_layout[i] * wi_data[i]) == 0 || " + << "\n\t (wi_layout[i] * wi_data[i]) % tdescShape[i] == 0).\n"; } } @@ -696,25 +773,18 @@ mlir::LogicalResult LoadNDOp::verify() { } if (tdescShape != valueShape) - return emitOpError( - "Result shape doesn't match TensorDesc shape." - "The expected shape is " + - makeString(tdescShape) + - ", while " - "the given shape is " + - makeString(valueShape) + - ". " - "In VC mode, when VNNI is not enabled, the result should have the same " - "shape (or transposed shape if transpose is also enabled) as " - "TensorDesc; " - "when VNNI is enabled, the result should have one more dimention than " - "the " - "TensorDesc, with last dimention having vnni factor, but having same " - "number " - "of total data elements. The vnni factor are typically calculated as " - "simd_lane_width / elementTypeBitWidth. " - "For element type having more than 32 bits, vnni shouldn't be used. " - "In SIMT mode, the shape is derived from the mapping attributes.\n"); + return emitOpError("Result shape doesn't match TensorDesc shape.") + << "\nThe expected shape is " << makeString(tdescShape) << "." + << "\nBut the given shape is " << makeString(valueShape) << "." + << "\nIn VC mode, when VNNI is not enabled, the result should have " + << "the same shape (or transposed shape if transpose is enabled) " + << "as TensorDesc; \nwhen VNNI is enabled, the result should have " + << "one more dimension than the TensorDesc, with the last dimension " + << "having the vnni factor, \nbut having the same number of total data " + << "elements. The vnni factor is typically calculated as " + << "simd_lane_width / elementTypeBitWidth. \nFor element type " + << "having more than 32 bits, vnni shouldn't be used. 
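A self-contained worked instance of the VNNI rule in the diagnostic above, with illustrative numbers (an 8x16 f16 tile, vnni_axis = 0, factor = 32 / 16 = 2):

  #include <cassert>
  #include <vector>
  int main() {
    std::vector<long> shape = {8, 16};
    const long factor = 32 / 16; // simd_lane_width / elementTypeBitWidth
    shape[0] /= factor;          // the vnni_axis extent shrinks by the factor
    shape.push_back(factor);     // ...and reappears as a trailing dimension
    // One more dimension, same total element count: 8*16 == 4*16*2.
    assert(shape[0] * shape[1] * shape[2] == 8 * 16);
    return 0;
  }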
\nIn SIMT " << "mode, the shape is derived from the mapping attributes.\n"; return mlir::success(); } @@ -769,17 +839,32 @@ ::mlir::ParseResult StoreNDOp::parse(::mlir::OpAsmParser &parser, } void StoreNDOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getValue(); printer << ","; printer << ' '; printer << getTensorDesc(); - if ((*this)->getAttrs().size()) { + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { printer << "mode = " << getMode(); - printCacheHintAttrs(printer, *this, true); + printSep = true; + } + + printCacheHintAttrs(printer, *this, printSep); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } + printer << ' ' << ":"; printer << ' '; printer << getValue().getType(); @@ -788,7 +873,7 @@ void StoreNDOp::print(::mlir::OpAsmPrinter &printer) { printer << getTensorDesc().getType(); } -mlir::LogicalResult StoreNDOp::verify() { +::mlir::LogicalResult StoreNDOp::verify() { auto dstTy = getTensorDesc().getType(); // Tile auto valTy = llvm::dyn_cast(getValue().getType()); // Vector @@ -816,10 +901,11 @@ mlir::LogicalResult StoreNDOp::verify() { if (width < MIN_2D_BLOCK_WIDTH_IN_ELEMENTS || width > MAX_2D_BLOCK_WIDTH_IN_ELEMENTS || (width * elemTyByteWidth) % 4 != 0) { - return emitOpError("Invalid width size for 2D block write. \ The specification expects the value to \ be in range [1, 64], and The the total \ data size (width * elemTyBytes) to be multiple of 4.\n"); + return emitOpError( + "Invalid width size for 2D block write. " + "The specification expects the value to " + "be in range [1, 64], and the total " + "data size (width * elemTyBytes) to be a multiple of 4.\n"); } if (height < MIN_2D_BLOCK_HEIGHT_IN_ELEMENTS || @@ -837,8 +923,8 @@ mlir::LogicalResult StoreNDOp::verify() { return emitOpError("In VC mode, the value (vector) shape doesn't match " "the memory (dst) shape.\n"); } else { - auto encoding = dstTy.getEncoding(); - if (!isMappingAttr(encoding)) { + auto mapping = dstTy.getMapping(); + if (!mapping) { return emitOpError("Expecting either SgMap, WgMap or XeMap attribute for " "SIMT mode operators.\n"); } imex::xegpu::WorkGroupMapAttr wgMap; imex::xegpu::SubGroupMapAttr sgMap; std::vector shape = dstTy.getShape().vec(); - if (auto xeMapAttr = llvm::dyn_cast(encoding)) { + if (auto xeMapAttr = llvm::dyn_cast(mapping)) { wgMap = xeMapAttr.getWg(); sgMap = xeMapAttr.getSg(); } else { - wgMap = llvm::dyn_cast(encoding); - sgMap = llvm::dyn_cast(encoding); + wgMap = llvm::dyn_cast(mapping); + sgMap = llvm::dyn_cast(mapping); }
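A self-contained illustration of the divisibility contract that verifyAndInferShape enforces below, with made-up numbers: a 128x128 tile distributed over sg_layout = [2, 8] and sg_data = [32, 16] passes the checks and leaves each subgroup a 64x16 tile, i.e. shape[i] /= sg_layout[i]:

  #include <cassert>
  #include <vector>
  int main() {
    std::vector<long> shape = {128, 128};
    const long sgLayout[2] = {2, 8}, sgData[2] = {32, 16};
    for (int i = 0; i < 2; ++i) {
      assert(shape[i] % sgLayout[i] == 0 && shape[i] % sgData[i] == 0 &&
             shape[i] % (sgLayout[i] * sgData[i]) == 0);
      shape[i] /= sgLayout[i]; // per-subgroup tile: {64, 16}
    }
    return 0;
  }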
- if (wgMap) { - auto sgData = wgMap.getSgData(); - auto sgLayout = wgMap.getSgLayout(); - for (size_t i = 0; i < sgData.size(); i++) { - if (shape[i] % sgLayout[i] != 0 || shape[i] % sgData[i] != 0 || - shape[i] % (sgLayout[i] * sgData[i]) != 0) - return emitOpError("Invalid WorkGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % sgLayout[i] == 0 && " - "tdescShape[i] % sgData[i] == 0 && " - "tdescShape[i] % (sgLayout[i] *sgData[i]) == 0"); - shape[i] /= sgLayout[i]; - } - } - if (sgMap) { - auto blockSize = sgMap.getMmaBlockSize(); - auto wiLayout = sgMap.getWiLayout(); - auto wiData = sgMap.getWiData(); - for (size_t i = 0; i < shape.size(); i++) { - if (blockSize[i] % (wiLayout[i] * wiData[i]) != 0 || - blockSize[i] % wiLayout[i] != 0 || blockSize[i] % wiData[i] != 0 || - shape[i] % blockSize[i] != 0) { - return emitOpError("Invalid SubGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % blockSize[i] == 0 && " - "blockSize[i] % wiLayout[i] == 0 && " - "blockSize[i] % wiData[i] == 0 && " - "blockSize[i] % (wiLayout[i] * wiData[i]) == 0 "); - } - } - - for (size_t i = 0; i < wiLayout.size(); i++) { - if (shape[i] % wiData[i] != 0 || - shape[i] % (wiLayout[i] * wiData[i]) != 0) { - return emitOpError("Invalid SubGroupMapAttr. It should meet the " - "following conditions: " - "tdescShape[i] % wiData[i] == 0 && " - "tdescShape[i] % (wiLayout[i] * wiData[i]) == 0 "); - } - shape[i] /= wiLayout[i]; - } + if (!verifyAndInferShape(shape, wgMap, sgMap)) { + return emitOpError("Failed to infer the shape.") + << "\nIt should meet the following conditions for " "WorkGroupMapAttr: " + << "\n\t tdescShape[i] % sg_layout[i] == 0 && " + << "\n\t tdescShape[i] % sg_data[i] == 0 && " + << "\n\t (tdescShape[i] % (sg_layout[i] * sg_data[i]) == 0 ||" + << "\n\t (sg_layout[i] * sg_data[i]) % tdescShape[i] == 0)" + << "\n\nAnd after performing shape[i] /= sg_layout[i], " + << "the new shape[i] should meet the following conditions " "for SubGroupMapAttr: " + << "\n\ttdescShape[i] % mma_block_size[i] == 0 (if it has) && " + << "\n\ttdescShape[i] % wi_layout[i] == 0 && " + << "\n\ttdescShape[i] % wi_data[i] == 0 && " + << "\n\t(tdescShape[i] % (wi_layout[i] * wi_data[i]) == 0 || " + << "\n\t (wi_layout[i] * wi_data[i]) % tdescShape[i] == 0).\n"; } if (shape != valTy.getShape().vec()) @@ -936,13 +996,25 @@ ::mlir::ParseResult PrefetchNDOp::parse(::mlir::OpAsmParser &parser, } void PrefetchNDOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - // printer.printOptionalAttrDict((*this)->getAttrs()); - if ((*this)->getAttrs().size()) { + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { printer << "mode = " << getMode(); - printCacheHintAttrs(printer, *this, true); + printSep = true; + } + + printCacheHintAttrs(printer, *this, printSep); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } @@ -951,7 +1023,7 @@ void PrefetchNDOp::print(::mlir::OpAsmPrinter &printer) { printer << getTensorDesc().getType(); } -mlir::LogicalResult DpasOp::verify() { +::mlir::LogicalResult DpasOp::verify() { int64_t lhsRank = getLhsType().getRank(); int64_t rhsRank = getRhsType().getRank(); @@ -1059,25 +1131,44 @@ ::mlir::ParseResult LoadGatherOp::parse(::mlir::OpAsmParser &parser, }
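A usage sketch for the enum-based LoadGatherOp builder declared in XeGPUOps.td above, assuming `rewriter`, `loc`, `resultTy`, `tdesc`, and `mask` are placeholders in a surrounding pattern. The explicit null transpose plus one explicit enum hint select the enum overload unambiguously; the remaining hints default to CACHED and the builder stamps mode = vc:

  auto loaded = rewriter.create<imex::xegpu::LoadGatherOp>(
      loc, resultTy, tdesc, mask, /*vnni_axis=*/mlir::IntegerAttr(),
      /*transpose=*/mlir::DenseI64ArrayAttr(),
      imex::xegpu::CacheReadHint::CACHED);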
@@ -936,13 +996,25 @@ ::mlir::ParseResult PrefetchNDOp::parse(::mlir::OpAsmParser &parser, } void PrefetchNDOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); printer << ' '; printer << getTensorDesc(); - // printer.printOptionalAttrDict((*this)->getAttrs()); - if ((*this)->getAttrs().size()) { + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { printer << "mode = " << getMode(); - printCacheHintAttrs(printer, *this, true); + printSep = true; + } + + printCacheHintAttrs(printer, *this, true); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } @@ -951,7 +1023,7 @@ void PrefetchNDOp::print(::mlir::OpAsmPrinter &printer) { printer << getTensorDesc().getType(); } -mlir::LogicalResult DpasOp::verify() { +::mlir::LogicalResult DpasOp::verify() { int64_t lhsRank = getLhsType().getRank(); int64_t rhsRank = getRhsType().getRank(); @@ -1059,25 +1131,44 @@ ::mlir::ParseResult LoadGatherOp::parse(::mlir::OpAsmParser &parser, } void LoadGatherOp::print(mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getTensorDesc(); printer << ","; printer << ' '; printer << getMask(); - if ((*this)->getAttrs().size()) { + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; + } + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { printer << "mode = " << getMode(); - if (getVnniAxisAttr()) - printer << ", vnni_axis = " << getVnniAxis().value(); + printSep = true; + } - if (getTransposeAttr()) { - printer << ", transpose = "; - getTransposeAttr().print(printer); - } + if (getVnniAxisAttr()) { + if (printSep) + printer << "," << ' '; + printer << "vnni_axis = " << getVnniAxis().value(); + printSep = true; + } - printCacheHintAttrs(printer, *this, true); + if (getTransposeAttr()) { + if (printSep) + printer << "," << ' '; + printer << "transpose = "; + getTransposeAttr().print(printer); + printSep = true; + } + printCacheHintAttrs(printer, *this, printSep); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } @@ -1092,11 +1183,16 @@ void LoadGatherOp::print(mlir::OpAsmPrinter &printer) { printer << getValue().getType(); } -mlir::LogicalResult LoadGatherOp::verify() { +::mlir::LogicalResult LoadGatherOp::verify() { auto tdescTy = getTensorDesc().getType(); auto maskTy = getMask().getType(); auto valueTy = getValue().getType(); + auto encoding = tdescTy.getEncoding(); + if (!encoding || !llvm::isa<imex::xegpu::ScatteredAttr>(encoding)) + return emitOpError( "LoadGatherOp only works on TensorDesc with ScatteredAttr."); + auto getElementType = [&](mlir::Type type) -> mlir::Type { if (type.isIntOrIndexOrFloat()) return type; @@ -1131,6 +1227,13 @@ mlir::LogicalResult LoadGatherOp::verify() { if (tdescShape != maskShape) return emitOpError("Mask should have the same shape as TensorDesc."); + auto mode = getMode(); + auto mapping = tdescTy.getMapping(); + if (mode == imex::xegpu::Mode::SIMT || mapping) { + return emitOpError("LoadGatherOp only supports VC mode, and the mapping " "attribute of TensorDesc is not expected.\n"); + } + if (getTranspose()) { auto trans = getTranspose().value(); if (tdescShape.size() >= trans.size()) @@ -1150,15 +1253,12 @@ mlir::LogicalResult LoadGatherOp::verify() { return emitOpError( "Result shape doesn't match TensorDesc shape. when VNNI is not enabled," "the result should have the same shape (or transposed shape if " - "transpose" - "is also enabled) as TensorDesc. When VNNI is enabled, the result " - "should" - "have one more dimention than the TensorDesc, with last dimention " - "having" - "vnni factor, but having same number of total data elements. The vnni " - "factor are typically calculated as simd_lane_width / " - "elementTypeBitWidth." - "For element type having more than 32 bits, vnni shouldn't be used.\n"); + "transpose is also enabled) as TensorDesc. When VNNI is enabled, " + "the result should have one more dimension than the TensorDesc, " + "with the last dimension having the vnni factor, but the same number of " + "total data elements. The vnni factor is typically calculated as " + "simd_lane_width/elementTypeBitWidth. For element types having " + "more than 32 bits, vnni shouldn't be used.\n"); return ::mlir::success(); }
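For the VNNI wording in this message, a numeric reading that lines up with the load_nd expectations in the tests below (the 32-bit SIMD lane width is an assumption of the example, not stated in this patch): with f16 elements the vnni factor is simd_lane_width / elementTypeBitWidth = 32 / 16 = 2, so a tensor_desc<16x16xf16> loaded with vnni_axis = 0 yields vector<8x16x2xf16>, and a tensor_desc<8x16xf16> with vnni_axis = 1 yields vector<8x8x2xf16>; the split axis shrinks by the factor, a trailing dimension of that size is appended, and the total element count (256 and 128 respectively) is unchanged. For f32 or wider element types the factor degenerates to 1, which is why the message advises against VNNI there.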
@@ -1244,6 +1344,11 @@ ::mlir::ParseResult StoreScatterOp::parse(::mlir::OpAsmParser &parser, } void StoreScatterOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); + printer << ' '; printer << getValue(); printer << ","; @@ -1252,10 +1357,19 @@ void StoreScatterOp::print(::mlir::OpAsmPrinter &printer) { printer << ","; printer << ' '; printer << getMask(); - if ((*this)->getAttrs().size()) { + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { printer << "mode = " << getMode(); - printCacheHintAttrs(printer, *this, true); + printSep = true; + } + + printCacheHintAttrs(printer, *this, printSep); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { printer << "}"; } @@ -1275,6 +1389,11 @@ ::mlir::LogicalResult StoreScatterOp::verify() { auto tdescTy = getTensorDesc().getType(); auto maskTy = getMask().getType(); + auto encoding = tdescTy.getEncoding(); + if (!encoding || !llvm::isa<imex::xegpu::ScatteredAttr>(encoding)) + return emitOpError("Invalid TensorDesc. StoreScatterOp only works on " "TensorDescs with ScatteredAttr."); + std::vector<int64_t> valueShape, maskShape; auto getShape = [&](mlir::Type type, std::vector<int64_t> &shape) -> void { if (type.isIntOrIndexOrFloat()) return @@ -1288,12 +1407,102 @@ getShape(valueTy, valueShape); getShape(maskTy, maskShape); - if (tdescTy.getShape().vec() != maskShape || valueShape != maskShape) { - return emitOpError( "Mask and value should have the same shape/size as TensorDesc." "Mask and Value can be scalar if TensorDesc is in form of " "TensorDesc<1xf16>."); + if (valueShape != maskShape) { + return emitOpError("Mask and value should have the same shape/size."); + } + + auto tdescShape = tdescTy.getShape().vec(); + + auto mode = getMode(); + auto mapping = tdescTy.getMapping(); + + if (mode != imex::xegpu::Mode::VC || mapping) { + return emitOpError("StoreScatterOp only supports VC mode, and the mapping " "attribute of TensorDesc is not expected.\n"); + } + + if (tdescShape != valueShape) { + return emitOpError("TensorDesc shape and value shape don't match. 
") + << "The expected/derived value shape is: " << makeString(tdescShape) + << ".\nMask and value should have the same shape/size as " + "TensorDesc.\n"; + } + + return ::mlir::success(); +} + +::mlir::ParseResult PrefetchOp::parse(::mlir::OpAsmParser &parser, + ::mlir::OperationState &result) { + mlir::OpAsmParser::UnresolvedOperand TensorDescRawOperands[1]; + llvm::ArrayRef<::mlir::OpAsmParser::UnresolvedOperand> TensorDescOperands( + TensorDescRawOperands); + llvm::SMLoc TensorDescOperandsLoc; + mlir::Type TensorDescRawTypes[1]; + llvm::ArrayRef<::mlir::Type> TensorDescTypes(TensorDescRawTypes); + + TensorDescOperandsLoc = parser.getCurrentLocation(); + if (parser.parseOperand(TensorDescRawOperands[0])) + return ::mlir::failure(); + + if (parseOptionalAttrDict(parser, result, + {"mode", "l1_hint", "l2_hint", "l3_hint"})) + return mlir::failure(); + + if (parser.parseColon()) + return ::mlir::failure(); + + if (parser.parseType(TensorDescRawTypes[0])) + return ::mlir::failure(); + if (parser.resolveOperands(TensorDescOperands, TensorDescTypes, + TensorDescOperandsLoc, result.operands)) + return ::mlir::failure(); + return ::mlir::success(); +} + +void PrefetchOp::print(::mlir::OpAsmPrinter &printer) { + auto mode = getMode(); + bool printSep = false; + auto printDefaults = printDefaultValues(); + auto numAttrs = (*this)->getAttrs().size(); + + printer << ' '; + printer << getTensorDesc(); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { + printer << ' ' << "{"; + } + + if (printDefaults || mode != imex::xegpu::Mode::SIMT) { + printer << "mode = " << getMode(); + printSep = true; + } + + printCacheHintAttrs(printer, *this, printSep); + + if (printDefaults || mode != imex::xegpu::Mode::SIMT || numAttrs > 1) { + printer << "}"; + } + + printer << ' ' << ":"; + printer << ' '; + printer << getTensorDesc().getType(); +} + +::mlir::LogicalResult PrefetchOp::verify() { + auto mode = getMode(); + auto tdescTy = getTensorDesc().getType(); + auto encoding = tdescTy.getEncoding(); + auto mapping = tdescTy.getMapping(); + + if (!encoding || !llvm::isa(encoding)) + return emitOpError("Invalid TensorDesc. PrefetchOp only works on " + "TensorDescs with ScatteredAttr."); + + if (mode != imex::xegpu::Mode::VC || mapping) { + return emitOpError("PrefetchOp only supports VC mode. and mapping " + "attribute of TensorDesc is not expected.\n"); } + return ::mlir::success(); } @@ -1310,29 +1519,18 @@ ::mlir::LogicalResult UpdateOffsetOp::verify() { auto shape = srcTy.getShape(); auto encoding = srcTy.getEncoding(); - if (!encoding || !llvm::isa(encoding)) { - return emitOpError( - "Invalid TensorDesc, it should have a scattered attribute."); + if (!encoding) { + return emitOpError("Invalid TensorDesc. UpdateOffsetOp only works on " + "TensorDescs with ScatteredAttr."); } - // For VC mode with chunkSize > 1. For chunkSize == 1, it is hard to - // distinguish between VC and SIMT mode by only looking at updateOffsetOp - // itself. So current verifier skipped these two cases. 
@@ -1310,29 +1519,18 @@ ::mlir::LogicalResult UpdateOffsetOp::verify() { auto shape = srcTy.getShape(); auto encoding = srcTy.getEncoding(); - if (!encoding || !llvm::isa<imex::xegpu::ScatteredAttr>(encoding)) { - return emitOpError( "Invalid TensorDesc, it should have a scattered attribute."); + if (!encoding) { + return emitOpError("Invalid TensorDesc. UpdateOffsetOp only works on " "TensorDescs with ScatteredAttr."); } - // For VC mode with chunkSize > 1. For chunkSize == 1, it is hard to - // distinguish between VC and SIMT mode by only looking at updateOffsetOp - // itself. So current verifier skipped these two cases. - if (shape.size() == 2) { - if (!llvm::isa<mlir::VectorType>(offTy)) - return emitOpError( "Based on TensorDesc shape, it is an VC tensor descriptor, " "in which the offset should be an 1D vector."); - - auto vecTy = llvm::dyn_cast<mlir::VectorType>(offTy); - if (vecTy.getRank() != 1) - return emitOpError("The index should be an 1D vector Type for VC mode " "tensor descriptor."); + auto vecTy = llvm::dyn_cast<mlir::VectorType>(offTy); + if (!vecTy || vecTy.getRank() != 1) + return emitOpError("The offset should be a 1D vector.\n"); - if (shape[0] != vecTy.getShape()[0]) - return emitOpError("For VC Mode TensorDesc. The offset should have same" "length as the dim-0 of TensorDesc."); - } + if (shape[0] != vecTy.getShape()[0]) + return emitOpError( "The offset should have the same length as dim-0 of TensorDesc."); return ::mlir::success(); } @@ -1340,10 +1538,19 @@ ::mlir::LogicalResult UpdateNDOffsetOp::verify() { // number of offsets specified must match the rank of the tensor descriptor if (getTensorDesc().getType().getRank() != getOffsets().size()) { - return emitOpError("invalid number of offsets."); + return emitOpError("Invalid number of offsets."); } return ::mlir::success(); } + +::mlir::LogicalResult AtomicRMWOp::verify() { + auto mode = getMode(); + if (mode != imex::xegpu::Mode::VC) { + return emitOpError("AtomicRMWOp only works in VC mode.\n"); + } + return ::mlir::success(); +} + } // namespace xegpu } // namespace imex diff --git a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir index a06272f10..67c4aab5a 100644 --- a/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir +++ b/test/Conversion/XeGPUToSPIRV/atomic_basic.vc.mlir @@ -14,8 +14,8 @@ module @gemm attributes {gpu.container_module} { %mask = arith.constant dense<true> : vector<16xi1> %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> %1 = arith.constant dense<0.5> : vector<16xf32> - %2 = xegpu.create_tdesc %arg0, %offsets {chunk_size_per_lane = 1} : memref<8x16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + %2 = xegpu.create_tdesc %arg0, %offsets {mode = vc, chunk_size_per_lane = 1} : memref<8x16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> + %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 {mode = vc} : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32> gpu.return } } diff --git a/test/Conversion/XeTileToXeGPU/sg_level_gemm_1k_1k_1k_f16_f32.mlir b/test/Conversion/XeTileToXeGPU/sg_level_gemm_1k_1k_1k_f16_f32.mlir index 47400e5cd..cd6384096 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_gemm_1k_1k_1k_f16_f32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_gemm_1k_1k_1k_f16_f32.mlir @@ -15,162 +15,162 @@ func.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: m // intialize C tile and load it //CHECK: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi 
{{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi %2, %c16_14 : index //CHECK-NEXT: arith.addi %3, %c16_15 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: 
arith.constant 16 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + 
//CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - 
//CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<64x64xf32> //CHECK: xegpu.load_nd {{.*}} {mode = vc, {{.*}}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> //CHECK-NEXT: xegpu.load_nd {{.*}} {mode = vc, {{.*}}} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> @@ -209,196 +209,196 @@ func.func @test_gemm(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: m // CHECK: arith.constant 0 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: 
xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : 
memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // 
CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} 
: memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> // CHECK: arith.constant 0 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 
32 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.addi {{.*}} : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> // compute the value of C tile by iterating over tiles in k-dimension and doing dpas // CHECK: scf.for diff --git a/test/Conversion/XeTileToXeGPU/sg_level_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_level_load_tile.mlir index c333b564e..40b929854 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_load_tile.mlir @@ -5,10 +5,10 @@ func.func @sglevel_tiled_load_tile(%a: memref<1024x1024xf16>, %b: memref<1024x10 //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: 
arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<16x16xf16> //CHECK: xegpu.load_nd {{.*}} {mode = vc, {{.*}}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_level_scf_for.mlir index 4a09b538f..34e1364ce 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_scf_for.mlir @@ -4,10 +4,10 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 %c0 = arith.constant 0 : index %c64 = arith.constant 64 : index %c1024 = arith.constant 1024 : index - //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<16x16xf16> %2 = arith.constant dense<0.0> : vector<16x16xf16> //CHECK: !xegpu.tensor_desc<8x16xf16>, !xegpu.tensor_desc<8x16xf16>, vector<8x16xf16>, vector<8x16xf16> @@ -22,8 +22,8 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 scf.yield %5, %3: !xetile.tile<16x16xf16>, vector<16x16xf16> } - //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> - //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<16x16xf16> //CHECK: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> //CHECK: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_store.mlir b/test/Conversion/XeTileToXeGPU/sg_level_store.mlir index 833140e3e..6e0d04be6 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_store.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_store.mlir @@ -12,28 +12,28 @@ func.func @sglevel_tiled_store(%a: memref<1024x1024xf32>) { // CHECK: arith.constant 0 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> 
+ // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> // CHECK: xegpu.store_nd {{.*}} {mode = vc, {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tile_mma.mlir index e654e7b4f..76d42e9c6 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tile_mma.mlir @@ -5,28 +5,28 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: 
xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<4x2x8x16xf16> //CHECK: xegpu.load_nd {{.*}} {mode = vc, vnni_axis = 1, {{.*}}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16> @@ -41,28 +41,28 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> 
!xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 80 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> %3 = xetile.init_tile %b[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<2x4x16x16xf16> //CHECK: xegpu.load_nd {{.*}} {mode = vc, vnni_axis = 0, {{.*}}} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_gemm.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_gemm.mlir index 56b2334d3..e7f348e2d 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_gemm.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_gemm.mlir @@ -32,162 +32,162 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> 
!xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - 
//CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: 
arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : 
memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%offset_0_dim_0, %offset_0_dim_1] : memref<1024x1024xf16> -> !xetile.tile<8x4x8x16xf16> %tile_1_dim_0 = arith.constant 64 : index @@ -207,82 +207,82 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 16 : index 
//CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}}: index //CHECK-NEXT: arith.addi {{.*}}: index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: 
xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> %2 = xetile.init_tile %b[%offset_1_dim_0, %offset_1_dim_1] : memref<1024x1024xf16> -> !xetile.tile<4x4x16x16xf16> //CHECK: arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -598,162 +598,162 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : 
memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : 
index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc %arg2[%261, %262] {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc %arg2[%261, %262] {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 56 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> %9 = xetile.init_tile %c[%offset_3_dim_0, %offset_3_dim_1] : memref<1024x1024xf32> -> !xetile.tile<8x4x8x16xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_load_tile.mlir index 8a7a9fbd0..1fa6b5f4c 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_load_tile.mlir @@ -4,10 +4,10 @@ func.func @sglevel_tiled_load_tile(%a: memref<1024x1024xf16>, %b: memref<1024x10 %c64 = arith.constant 64 : index //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: 
xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<2x1x8x16xf16> //CHECK: xegpu.load_nd {{.*}} {mode = vc, {{.*}}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> //CHECK-NEXT: xegpu.load_nd {{.*}} {mode = vc, {{.*}}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_scf_for.mlir index 5a4cb1d86..79508f499 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_scf_for.mlir @@ -8,10 +8,10 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 %c1024 = arith.constant 1024 : index //CHECK: arith.constant 0 : index //CHECK: arith.constant 64 : index - //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK: arith.constant 8 : index //CHECK: arith.constant 64 : index - //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<2x1x8x16xf16> //CHECK: arith.constant dense<0.000000e+00> : vector<8x16xf16> //CHECK: arith.constant dense<0.000000e+00> : vector<8x16xf16> @@ -30,10 +30,10 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 } //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 64 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<2x1x8x16xf16> //CHECK: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_simple.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_simple.mlir index cb8d84377..be68ad3a6 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_simple.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_simple.mlir @@ -22,24 +22,24 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi %6, %c0_1 : index //CHECK-NEXT: arith.addi %7, %c0_2 : index - 
//CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi %6, %c8 : index //CHECK-NEXT: arith.addi %7, %c0_3 : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xetile.init_tile %a[%offset_0_dim_0, %offset_0_dim_1] : memref<1024x1024xf16> -> !xetile.tile<2x1x8x16xf16> //CHECK: arith.constant 0 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16> %2 = xetile.init_tile %b[%offset_0_dim_0, %offset_0_dim_1] : memref<1024x1024xf16> -> !xetile.tile<1x2x16x16xf16> //CHECK: arith.constant dense<0.000000e+00> : vector<8x16xf32> @@ -80,22 +80,22 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> %5 = xetile.init_tile %c[%offset_0_dim_0, %offset_0_dim_1] : memref<1024x1024xf32> -> 
!xetile.tile<2x2x8x16xf32> //CHECK: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> //CHECK-NEXT: xegpu.store_nd {{.*}} {mode = vc, {{.*}}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_store.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_store.mlir index 7a47e7216..9b33b89e9 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_store.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_store.mlir @@ -2,100 +2,100 @@ func.func @sglevel_tiled_store(%a: memref<1024x1024xf32>) { // CHECK: arith.constant 0 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 0 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 8 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 16 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 24 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 32 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 40 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 64 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 48 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 32 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 48 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 64 : index 
- // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK-NEXT: arith.constant 56 : index // CHECK-NEXT: arith.constant 80 : index - // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<8x4x8x16xf32> //CHECK: arith.constant dense<0.000000e+00> : vector<8x16xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_level_tiled_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_level_tiled_tile_mma.mlir index 9a0e4cc76..43b553971 100644 --- a/test/Conversion/XeTileToXeGPU/sg_level_tiled_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_level_tiled_tile_mma.mlir @@ -20,162 +20,162 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1 //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: 
arith.constant 8 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 8 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + 
//CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 24 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 0 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 16 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 32 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> + //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16> //CHECK-NEXT: arith.constant 40 : index //CHECK-NEXT: arith.constant 48 : index //CHECK-NEXT: arith.addi {{.*}} : index //CHECK-NEXT: arith.addi {{.*}} : index - 
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 56 : index
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 56 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 56 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   //CHECK-NEXT: arith.constant 56 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xetile.init_tile %a[%offset_0_dim_0, %offset_0_dim_1] : memref<1024x1024xf16> -> !xetile.tile<8x4x8x16xf16>
   //CHECK: xegpu.load_nd {{.*}} {mode = vc, vnni_axis = 1, {{.*}}} : !xegpu.tensor_desc<8x16xf16> -> vector<8x8x2xf16>
@@ -229,82 +229,82 @@ func.func @sglevel_tiled_gemm(%a: memref<1024x1024xf16>, %b: memref<1024x1024xf1
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 0 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 16 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 32 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.constant 48 : index
   //CHECK-NEXT: arith.addi {{.*}} : index
   //CHECK-NEXT: arith.addi {{.*}} : index
-  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc, boundary_check = true} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
+  //CHECK-NEXT: xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<1024x1024xf16> -> !xegpu.tensor_desc<16x16xf16>
   %3 = xetile.init_tile %b[%offset_1_dim_0, %offset_1_dim_1] : memref<1024x1024xf16> -> !xetile.tile<4x4x16x16xf16>
   //CHECK: xegpu.load_nd {{.*}} {mode = vc, vnni_axis = 0, {{.*}}} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
index cf1f34f60..1b106ab0a 100644
--- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir
+++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir
@@ -53,13 +53,13 @@ func.func @test_store_nd_vc(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) {
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -92,7 +92,7 @@ func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) {
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
@@ -112,7 +112,7 @@ func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) {
 // CHECK-LABEL: func @test_prefetch_nd_vc({{.*}}) {
 func.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) {
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: xegpu.prefetch_nd
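Across these tests the printed form of create_nd_tdesc no longer includes boundary_check = true: the attribute keeps its default value of true, and the printer now appears to elide attributes that sit at their default. A minimal sketch of the round-trip (operand names are illustrative, not taken from any one test):

  // Written form; boundary_check is left at its default (true):
  %tdesc = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
      : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
  // Printed form, which is what the updated CHECK-SAME lines match:
  //   xegpu.create_nd_tdesc {{.*}} {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>

Ops that spell boundary_check = true explicitly (see load_nd_vc.mlir below) print the same way, which is consistent with default elision rather than removal of the attribute.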
diff --git a/test/Dialect/XeGPU/IR/atomic_rmw.mlir b/test/Dialect/XeGPU/IR/atomic_rmw.mlir
index dc5bdc70a..001b8cffa 100644
--- a/test/Dialect/XeGPU/IR/atomic_rmw.mlir
+++ b/test/Dialect/XeGPU/IR/atomic_rmw.mlir
@@ -4,36 +4,40 @@
 // Verify the generic form can be parsed.
 // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s

-#sg_map_fp32 = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>

 // CHECK-LABEL: func @test_atomic_rmw({{.*}}) {
-func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x1xf32>, %mask : vector<16xi1>) {
-  %1 = xegpu.create_tdesc %src, %offsets: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #sg_map_fp32>
+func.func @test_atomic_rmw(%src: ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) {
+  %1 = xegpu.create_tdesc %src, %offsets {mode=vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.atomic_rmw
-  // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>, vector<16xi1>, vector<16x1xf32>
-  xegpu.atomic_rmw "addf" %1, %mask, %value: !xegpu.tensor_desc<16xf32, #sg_map_fp32>, vector<16xi1>, vector<16x1xf32> -> vector<16x1xf32>
+  // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32>
+  xegpu.atomic_rmw "addf" %1, %mask, %value {mode=vc}
+    : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
   return
 }

 // CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) {
 func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) {
-  %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #sg_map_fp32>
+  %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
+    : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>
   // CHECK: xegpu.atomic_rmw
-  // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
-  xegpu.atomic_rmw "mulf" %1, %mask, %value : !xegpu.tensor_desc<16x2xf32, #sg_map_fp32>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
+  // CHECK-SAME: tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
+  xegpu.atomic_rmw "mulf" %1, %mask, %value {mode=vc}
+    : !xegpu.tensor_desc<16x2xf32, #xegpu.scattered>, vector<16xi1>, vector<16x2xf32> -> vector<16x2xf32>
   return
 }

 // CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) {
 func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) {
-  %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #sg_map_fp32>
+  %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 2, mode=vc}
+    : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>
   // CHECK: xegpu.atomic_rmw
-  // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
-  xegpu.atomic_rmw "andi" %1, %mask, %value: !xegpu.tensor_desc<16x2xi32, #sg_map_fp32>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
+  // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
+  xegpu.atomic_rmw "andi" %1, %mask, %value {mode=vc}
+    : !xegpu.tensor_desc<16x2xi32, #xegpu.scattered>, vector<16xi1>, vector<16x2xi32> -> vector<16x2xf32>
   return
 }
diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
index d5aa32cb8..ce10c2471 100644
--- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
+++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir
@@ -4,21 +4,19 @@
 // Verify the generic form can be parsed.
 // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s

-#sg_map_fp16 = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
+#sg_map_fp16 = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>
 func.func @test_create_nd_tdesc_0(%src: memref<24x32xf16>) {
   %c0 = arith.constant 2 : index
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %2 = xegpu.create_nd_tdesc %src[2, 4] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
@@ -28,8 +26,7 @@
 // CHECK-LABEL: func @test_create_nd_tdesc_1({{.*}}) {
 func.func @test_create_nd_tdesc_1(%src: memref<24x32xf16>, %x : index, %y : index) {
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
   return
@@ -39,8 +36,7 @@ func.func @test_create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
   return
 }
@@ -49,8 +45,7 @@ func.func @test_create_nd_tdesc_3(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
   return
 }
@@ -60,8 +55,7 @@ func.func @test_create_nd_tdesc_3(%src: memref<?x?xf16>, %w : index, %h : index,
 func.func @test_create_nd_tdesc_4(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {boundary_check = true}
     : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16>
   return
@@ -71,8 +65,7 @@ func.func @test_create_nd_tdesc_4(%src: memref<?x?xf16>, %w : index, %h : index,
 func.func @test_create_nd_tdesc_5(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1]
     : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #sg_map_fp16>
   return
@@ -82,8 +75,7 @@ func.func @test_create_nd_tdesc_5(%src: memref<?x?xf16>, %w : index, %h : index,
 func.func @test_create_nd_tdesc_6(%src: memref<?x?xf16>, %w : index, %h : index, %x : index, %y : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {boundary_check = true}
     : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #sg_map_fp16>
   return
@@ -92,8 +84,7 @@
 // CHECK-LABEL: func @test_create_nd_tdesc_7({{.*}}) {
 func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[%offset] : memref<1024xf16> -> !xegpu.tensor_desc<16xf16, #sg_map_fp16>
   return
 }
@@ -103,8 +94,7 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) {
 func.func @test_create_nd_tdesc_8(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {boundary_check = true}
     : memref<?x?xf16> -> !xegpu.tensor_desc<8x16xf16, memory_scope = slm, #sg_map_fp16>
   return
@@ -114,8 +104,7 @@ func.func @test_create_nd_tdesc_9(%src: memref<?x?xf16>, %w : index, %h : index, %x : index) {
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
-  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<64x128xf16, memory_scope = slm, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: memref<?x?xf16> -> !xegpu.tensor_desc<64x128xf16, memory_scope = slm, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {boundary_check = true}
     : memref<?x?xf16> -> !xegpu.tensor_desc<64x128xf16, memory_scope = slm, #sg_map_fp16>
   return
diff --git a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
index 1bf1098a8..073aede88 100644
--- a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -1,15 +1,15 @@
-// RUN: imex-opt %s | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s
 // Verify the printed output can be parsed.
-// RUN: imex-opt %s | imex-opt | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
 // Verify the generic form can be parsed.
-// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s

 // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
 func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
   // CHECK: xegpu.create_tdesc %arg0, %arg1
   // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
-  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   return
 }
@@ -28,7 +28,7 @@ func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
   // CHECK: xegpu.create_tdesc %arg0, %arg1
   // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8}
-  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, memory_scope = global, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8}
     : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
   return
@@ -54,24 +54,3 @@ func.func @test_create_tdesc_vc_5(%src: memref<?xf32>, %offsets : vector<16 x in
     : memref<?xf32>, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered>
   return
 }
-
-
-// CHECK-LABEL: func @test_create_tdesc_vc_6({{.*}}) {
-func.func @test_create_tdesc_vc_6(%src: memref<?xf32>, %offset : index) {
-  // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2}
-  // CHECK-SAME: memref<?xf32>, index -> !xegpu.tensor_desc<2xf32, memory_scope = slm, #xegpu.scattered>
-  %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 2}
-    : memref<?xf32>, index -> !xegpu.tensor_desc<2xf32, memory_scope = slm, #xegpu.scattered>
-  return
-}
-
-// CHECK-LABEL: func @test_create_tdesc_vc_7({{.*}}) {
-func.func @test_create_tdesc_vc_7(%src: memref<?xf32>, %offset : index) {
-  // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
-  // CHECK-SAME: memref<?xf32>, index -> !xegpu.tensor_desc<1xf32, memory_scope = slm, #xegpu.scattered>
-  %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 1}
-    : memref<?xf32>, index -> !xegpu.tensor_desc<1xf32, memory_scope = slm, #xegpu.scattered>
-  return
-}
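The RUN lines here and in load_gather_vc.mlir below add IMEX_XEGPU_PRINT_DEFAULTS=true. With that variable set, defaulted pieces of the descriptor type, such as memory_scope = global, are printed explicitly, which is what the updated CHECK-SAME lines match. A sketch of the two printed forms of the same descriptor (values as in the tests):

  %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
     : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
  // Default printing:
  //   !xegpu.tensor_desc<16xf32, #xegpu.scattered>
  // With IMEX_XEGPU_PRINT_DEFAULTS=true:
  //   !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>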
diff --git a/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
index 9201aa18c..daa1ea56a 100644
--- a/test/Dialect/XeGPU/IR/load_gather_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_gather_vc.mlir
@@ -1,8 +1,8 @@
-// RUN: imex-opt %s | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s
 // Verify the printed output can be parsed.
-// RUN: imex-opt %s | imex-opt | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
 // Verify the generic form can be parsed.
-// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s
+// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s

 // CHECK-LABEL: func @test_load_gather_vc({{.*}}) {
@@ -10,12 +10,12 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) {
   %0 = arith.constant dense<1>: vector<16xi1>
   // CHECK: xegpu.create_tdesc
   // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
-  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.load
   // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached}
-  // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+  // CHECK-SAME: !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
   %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
     : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
   return
@@ -26,49 +26,30 @@ func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) {
   %0 = arith.constant dense<1>: vector<16x8xi1>
   // CHECK: xegpu.create_tdesc
   // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8}
-  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, memory_scope = global, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8}
     : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>
   // CHECK: xegpu.load
   // CHECK-SAME: {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached}
-  // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
+  // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, memory_scope = global, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
   %2 = xegpu.load %1, %0 {mode = vc, transpose = [1, 0], l1_hint = cached, l2_hint = uncached}
     : !xegpu.tensor_desc<16x8xf32, #xegpu.scattered>, vector<16x8xi1> -> vector<8x16xf32>
   return
 }
-
-// CHECK-LABEL: func @test_load_gather_vc_3({{.*}}) {
-func.func @test_load_gather_vc_3(%src: ui64, %offset : index) {
-  %0 = arith.constant dense<1>: vector<8xi1>
-  // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8}
-  // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered>
-  %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 8}
-    : ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered>
-
-  // CHECK: xegpu.load
-  // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached}
-  // CHECK-SAME: !xegpu.tensor_desc<8xf32, #xegpu.scattered>, vector<8xi1> -> vector<8xf32>
-  %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
-    : !xegpu.tensor_desc<8xf32, #xegpu.scattered>, vector<8xi1> -> vector<8xf32>
-  return
-}
-
-
 // CHECK-LABEL: func @test_load_gather_vc_4({{.*}}) {
 func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) {
   %0 = arith.constant dense<1>: vector<16xi1>
   // CHECK: xegpu.create_tdesc
   // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
-  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 1}
     : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.load
   // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached}
-  // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+  // CHECK-SAME: !xegpu.tensor_desc<16xf32, memory_scope = global, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
   %2 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
     : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
   return
diff --git a/test/Dialect/XeGPU/IR/load_nd.mlir b/test/Dialect/XeGPU/IR/load_nd.mlir
index 3616c05bd..922f9970a 100644
--- a/test/Dialect/XeGPU/IR/load_nd.mlir
+++ b/test/Dialect/XeGPU/IR/load_nd.mlir
@@ -4,213 +4,194 @@
 // Verify the generic form can be parsed.
 // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s

-#sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
-#sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
-#sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
-#sg_map_fp16_d = #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}>
+#sg_map_fp16_a = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_fp16_b = #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_c = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_d = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>
 // CHECK-LABEL: func @test_load_nd_fp16({{.*}}) {
 func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : memref<24x32xf16>) {
   %c0 = arith.constant 2 : index
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 1}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: {vnni_axis = 1}
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   // CHECK-SAME: -> vector<4x1x2xf16>
   %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xf16>
     -> !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 0}
-  // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: {vnni_axis = 0}
+  // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1x2xf16>
   %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1xf32>
   %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> -> vector<8x1xf32>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
   %7 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 1}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: {vnni_axis = 1}
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>>
   // CHECK-SAME: -> vector<4x1x2xf16>
   %8 = xegpu.load_nd %7 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> -> vector<4x1x2xf16>
   return
 }

-#sg_map_bf16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
-#sg_map_bf16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
-#sg_map_bf16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
+#sg_map_bf16_a = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_bf16_b = #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_bf16_c = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>
 // CHECK-LABEL: func @test_load_nd_bf16({{.*}}) {
 func.func @test_load_nd_bf16(%A: memref<24x32xbf16>, %B : memref<24x32xbf16>, %C : memref<24x32xbf16>) {
   %c0 = arith.constant 2 : index
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xbf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 1}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+  // CHECK-SAME: {vnni_axis = 1}
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
   // CHECK-SAME: -> vector<4x1x2xbf16>
   %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xbf16, #sg_map_bf16_a>
     -> vector<4x1x2xbf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xbf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 0}
-  // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: {vnni_axis = 0}
+  // CHECK-SAME: !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1x2xbf16>
   %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xbf16, #sg_map_bf16_b> -> vector<8x1x2xbf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<24x32xbf16>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1xf32>
   %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_bf16_c> -> vector<8x1xf32>
   return
 }

-#sg_map_i8_a = #xegpu.sg_map<{mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]}>
-#sg_map_i8_b = #xegpu.sg_map<{mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
-#sg_map_i8_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
+#sg_map_i8_a = #xegpu.sg_map<mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]>
+#sg_map_i8_b = #xegpu.sg_map<mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_i8_c = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>
 // CHECK-LABEL: func @test_load_nd_i8({{.*}}) {
 func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : memref<64x64xi8>) {
   %c0 = arith.constant 2 : index
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xi8>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<{mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]>>
   %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 1}
-  // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<{mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]}>>
+  // CHECK-SAME: {vnni_axis = 1}
+  // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]>>
   // CHECK-SAME: -> vector<4x1x4xi8>
   %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x1x4xi8>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xi8>
-  // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<{mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt, vnni_axis = 0}
-  // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<{mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: {vnni_axis = 0}
+  // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1x4xi8>
   %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x1x4xi8>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xi8>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<64x64xi8> -> !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<8x1xi32>
   %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xi32, #sg_map_i8_c> -> vector<8x1xi32>
   return
 }

-#sg_map_f64_a = #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>
-#sg_map_f64_b = #xegpu.sg_map<{mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]}>
-#sg_map_f64_c = #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>
+#sg_map_f64_a = #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>
+#sg_map_f64_b = #xegpu.sg_map<mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]>
+#sg_map_f64_c = #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>
 // CHECK-LABEL: func @test_load_nd_f64({{.*}}) {
 func.func @test_load_nd_f64(%A: memref<64x64xf64>, %B : memref<64x64xf64>, %C : memref<64x64xf64>) {
   %c0 = arith.constant 2 : index
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xf64>
-  // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   %1 = xegpu.create_nd_tdesc %A[%c0, %c1] : memref<64x64xf64> -> !xegpu.tensor_desc<4x8xf64, #sg_map_f64_a>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<2x1xf64>
   %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<4x8xf64, #sg_map_f64_a> -> vector<2x1xf64>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xf64>
-  // CHECK-SAME: -> !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<{mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   %3 = xegpu.create_nd_tdesc %B[%c0, %c1] : memref<64x64xf64> -> !xegpu.tensor_desc<8x8xf64, #sg_map_f64_b>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<{mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<4x1xf64>
   %4 = xegpu.load_nd %3 : !xegpu.tensor_desc<8x8xf64, #sg_map_f64_b> -> vector<4x1xf64>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = simt, boundary_check = true}
   // CHECK-SAME: memref<64x64xf64>
-  // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   %5 = xegpu.create_nd_tdesc %C[%c0, %c1] : memref<64x64xf64> -> !xegpu.tensor_desc<4x8xf64, #sg_map_f64_c>
   // CHECK: xegpu.load_nd
-  // CHECK-SAME: {mode = simt}
-  // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>>
+  // CHECK-SAME: !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]>>
   // CHECK-SAME: -> vector<2x1xf64>
   %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<4x8xf64, #sg_map_f64_c> -> vector<2x1xf64>
diff --git a/test/Dialect/XeGPU/IR/load_nd_vc.mlir b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
index dd794285b..89f76e146 100644
--- a/test/Dialect/XeGPU/IR/load_nd_vc.mlir
+++ b/test/Dialect/XeGPU/IR/load_nd_vc.mlir
@@ -11,7 +11,7 @@ func.func @test_load_nd_simd_f32(%src: memref<24x32xf32>) {
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc, boundary_check = true}
     : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
@@ -32,7 +32,7 @@ func.func @test_load_nd_simd_f16(%src: memref<24x32xf16>, %x : index, %y : index) {
   // CHECK: xegpu.create_nd_tdesc
   // CHECK-SAME: %arg0[%arg1, %arg2]
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc, boundary_check = true}
     : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -49,7 +49,7 @@ func.func @test_load_nd_simd_bf16(%src: ui64, %w : index, %h : index, %x : index
   %c1 = arith.constant 1 : index
   // CHECK: xegpu.create_nd_tdesc
   // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1]
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: ui64 -> !xegpu.tensor_desc<8x16xbf16>
   %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc, boundary_check = true} : ui64 -> !xegpu.tensor_desc<8x16xbf16>
   // CHECK: xegpu.load_nd
diff --git a/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir b/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
index 5d8a2fd0c..98dc8ccc0 100644
--- a/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
+++ b/test/Dialect/XeGPU/IR/prefetch_nd_vc.mlir
@@ -19,7 +19,7 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) {
 // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_1({{.*}}) {
 func.func @test_prefetch_nd_tdesc_vc_1(%src: memref<24x32xf16>, %x : index, %y : index) {
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
     : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -45,7 +45,7 @@ func.func @test_prefetch_nd_tdesc_vc_i8(%src: memref<24x32xi8>) {
 // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_bf16({{.*}}) {
 func.func @test_prefetch_nd_tdesc_vc_bf16(%src: memref<24x32xbf16>, %x : index, %y : index) {
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
   %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc}
     : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
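The sg_map rewrites in simple_gemm.mlir below follow the new struct-style attribute syntax: the braces of the old custom form are dropped and the fields print as named parameters, with mma_block_size remaining optional (the old #sg_map_fp16_d in load_nd.mlir above already omitted it). For comparison, a sketch of the two spellings using the fp16 values from these tests:

  // Old custom syntax:
  //   #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
  // New struct-style syntax:
  #map_a = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>
  // mma_block_size may be omitted entirely:
  #map_d = #xegpu.sg_map<wi_layout = [2, 8], wi_data = [1, 2]>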
diff --git a/test/Dialect/XeGPU/IR/simple_gemm.mlir b/test/Dialect/XeGPU/IR/simple_gemm.mlir
index 7c0d59827..53c659b38 100644
--- a/test/Dialect/XeGPU/IR/simple_gemm.mlir
+++ b/test/Dialect/XeGPU/IR/simple_gemm.mlir
@@ -6,9 +6,9 @@

 // ---- BF16 ------

-#sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>
-#sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
-#sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>
+#sg_map_fp16_a = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>
+#sg_map_fp16_b = #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>
+#sg_map_fp16_c = #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]>
 // CHECK-LABEL: func @test_gemm_bf16({{.*}}) {
 func.func @test_gemm_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16>, %c: memref<1024x1024xf32>) {
   %c0 = arith.constant 0 : index
@@ -25,12 +25,12 @@ func.func @test_gemm_bf16(%a : memref<1024x1024xbf16>, %b: memref<1024x1024xbf16
   scf.for %j= %c0 to %c1024 step %c16 {
     // CHECK: xegpu.create_nd_tdesc
     // CHECK-SAME: memref<1024x1024xbf16>
-    // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>>
+    // CHECK-SAME: -> !xegpu.tensor_desc<8x16xbf16, #xegpu.sg_map<mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]>>
     %1 = xegpu.create_nd_tdesc %a[%i, %c0] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #sg_map_fp16_a>
     // CHECK: xegpu.create_nd_tdesc
     // CHECK-SAME: memref<1024x1024xbf16>
-    // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>>
+    // CHECK-SAME: -> !xegpu.tensor_desc<16x16xbf16, #xegpu.sg_map<mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]>>
     %2 = xegpu.create_nd_tdesc %b[%c0, %j] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #sg_map_fp16_b>
     %3 = arith.constant dense<0.0> : vector<8x1xf32>
diff --git a/test/Dialect/XeGPU/IR/store_nd_vc.mlir b/test/Dialect/XeGPU/IR/store_nd_vc.mlir
index 16a2824f1..e15b276a2 100644
--- a/test/Dialect/XeGPU/IR/store_nd_vc.mlir
+++ b/test/Dialect/XeGPU/IR/store_nd_vc.mlir
@@ -10,13 +10,13 @@ func.func @test_store_nd_vc_bf16(%src: memref<24x32xbf16>, %dst: memref<24x32xbf
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
   %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xbf16> -> !xegpu.tensor_desc<8x16xbf16>
@@ -39,13 +39,13 @@ func.func @test_store_nd_vc_f64(%src: memref<24x32xf64>, %dst: memref<24x32xf64>
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
   %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf64> -> !xegpu.tensor_desc<8x16xf64>
@@ -68,13 +68,13 @@ func.func @test_store_nd_vc_i8(%src: memref<24x32xi8>, %dst: memref<24x32xi8>) {
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
   %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xi8> -> !xegpu.tensor_desc<8x16xi8>
diff --git a/test/Dialect/XeGPU/IR/store_scatter.mlir b/test/Dialect/XeGPU/IR/store_scatter.mlir
index 8924aefb8..8c1bb1c38 100644
--- a/test/Dialect/XeGPU/IR/store_scatter.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter.mlir
@@ -5,29 +5,29 @@
 // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s

 // CHECK-LABEL: func @test_store_scatter({{.*}}) {
-func.func @test_store_scatter(%src: ui64, %offsets : index, %dst: ui64) {
-  %0 = arith.constant 1: i1
+func.func @test_store_scatter(%src: ui64, %offsets : vector<16xindex>, %dst: ui64) {
+  %0 = arith.constant dense<true>: vector<16xi1>
   // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = simt, chunk_size_per_lane = 1}
-  // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered>
-  %1 = xegpu.create_tdesc %src, %offsets
-    : ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered>
+  // CHECK-SAME: {mode = vc}
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+  %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
+    : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = simt, chunk_size_per_lane = 1}
-  // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered>
-  %2 = xegpu.create_tdesc %dst, %offsets
-    : ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered>
+  // CHECK-SAME: {mode = vc}
+  // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
+  %2 = xegpu.create_tdesc %dst, %offsets {mode = vc}
+    : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.load
-  // CHECK-SAME: {mode = simt, l1_hint = cached, l2_hint = uncached}
-  // CHECK-SAME: !xegpu.tensor_desc<1xf32, #xegpu.scattered>, i1 -> f32
-  %3 = xegpu.load %1, %0 {l1_hint = cached, l2_hint = uncached}
-    : !xegpu.tensor_desc<1xf32, #xegpu.scattered>, i1 -> f32
+  // CHECK-SAME: {mode = vc, l1_hint = cached, l2_hint = uncached}
+  // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
+  %3 = xegpu.load %1, %0 {mode = vc, l1_hint = cached, l2_hint = uncached}
+    : !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1> -> vector<16xf32>
   // CHECK: xegpu.store
-  // CHECK-SAME: {mode = simt, l1_hint = write_back, l2_hint = uncached}
-  // CHECK-SAME: f32, !xegpu.tensor_desc<1xf32, #xegpu.scattered>, i1
-  xegpu.store %3, %2, %0 {l1_hint = write_back, l2_hint = uncached}
-    : f32, !xegpu.tensor_desc<1xf32, #xegpu.scattered>, i1
+  // CHECK-SAME: {mode = vc, l1_hint = write_back, l2_hint = uncached}
+  // CHECK-SAME: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
+  xegpu.store %3, %2, %0 {mode = vc, l1_hint = write_back, l2_hint = uncached}
+    : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered>, vector<16xi1>
   return
 }
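store_scatter_vc.mlir below keeps its plain RUN lines (no IMEX_XEGPU_PRINT_DEFAULTS), so its checks go the other way: the defaulted chunk_size_per_lane = 1 disappears from the printed attribute dict, just as boundary_check did earlier. A sketch of the expected round-trip (names as in the test):

  %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
     : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
  // Prints without the defaulted chunk size:
  //   xegpu.create_tdesc %arg0, %arg1 {mode = vc}
  //     : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>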
diff --git a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
index e8650efb0..83f95487d 100644
--- a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
+++ b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir
@@ -8,13 +8,13 @@ func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) {
   %0 = arith.constant dense<1>: vector<16xi1>
   // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
     : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   %2 = xegpu.create_tdesc %dst, %offsets {mode = vc}
     : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
diff --git a/test/Dialect/XeGPU/IR/update_nd_offset.mlir b/test/Dialect/XeGPU/IR/update_nd_offset.mlir
index 4de5560e0..342cb6ae0 100644
--- a/test/Dialect/XeGPU/IR/update_nd_offset.mlir
+++ b/test/Dialect/XeGPU/IR/update_nd_offset.mlir
@@ -9,7 +9,7 @@ func.func @test_update_nd_offset_vc_0(%src: memref<24x32xf32>) {
   %c1 = arith.constant 4 : index
   // CHECK: xegpu.create_nd_tdesc
-  // CHECK-SAME: {mode = vc, boundary_check = true}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
diff --git a/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/test/Dialect/XeGPU/IR/update_offset_vc.mlir
index 812bbace2..e131d243a 100644
--- a/test/Dialect/XeGPU/IR/update_offset_vc.mlir
+++ b/test/Dialect/XeGPU/IR/update_offset_vc.mlir
@@ -8,7 +8,7 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) {
   %0 = arith.constant dense<1>: vector<16xi1>
   // CHECK: xegpu.create_tdesc
-  // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1}
+  // CHECK-SAME: {mode = vc}
   // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>
   %1 = xegpu.create_tdesc %src, %offsets {mode = vc}
     : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered>