diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2e5037e7..44ddf787 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ## [v0.3.2](https://github.com/intel/xetla/releases/tag/v0.3.2) (2023-06-16)
 - New Features
   * Added parameter-check functions for some kernel-level APIs; users need to call them explicitly before launching the kernel. The checks return failure and print error messages when an unsupported scenario is detected; launching the kernel anyway in an unsupported scenario may lead to unpredictable results.
+  * Removed the reduce_sum + tile_op epilogue policy.
   * Added some unit test cases.
   * Refined some example code.
   * Updated documents, added some diagrams and details.
diff --git a/README.md b/README.md
index 87929fb7..d0452e9c 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
-# Intel® Xe Templates for Linear Algebra [v0.3.2](/CHANGELOG.md)
+# Intel® Xe Templates for Linear Algebra
+
+_Intel® XeTLA [v0.3.2](/CHANGELOG.md) - June 2023_
+
 Intel® Xe Templates for Linear Algebra (Intel® XeTLA) is a collection of SYCL/ESIMD templates that enable high-performance General Matrix Multiply (GEMM), Convolution (CONV), and related computations on Intel Xe GPU architecture. Intel® XeTLA offers reusable C++ templates at the kernel, work-group, and sub-group levels, allowing developers to optimize and specialize kernels based on data types, tiling policies, algorithms, fusion policies, and more.
 
 One of the key features of Intel® XeTLA is its ability to abstract and hide details of Xe hardware implementations, particularly those related to matrix computations, such as the systolic array and other low-level instructions. This ensures that SYCL/DPC++ developers can focus on leveraging the performance benefits of Intel® XeTLA without being burdened by hardware-specific instructions.
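The parameter checks called out in the changelog surface later in this diff as `can_implement` in `tests/utils/execution.hpp`. A minimal sketch of the intended call pattern, assuming a `gemm_op_t` kernel type and buffers named as in the hunks below, with row-major leading dimensions (the `arguments_t` constructor order is taken from that hunk):

    // Hedged sketch: run the v0.3.2 parameter checks before launching.
    // gemm_op_t, matrix_{m,k,n}, and A/B/C are assumed to be defined as
    // in the example and test hunks below.
    typename gemm_op_t::arguments_t arg(matrix_m, matrix_k, matrix_n,
            A, matrix_k, // lda for row-major A
            B, matrix_n, // ldb for row-major B
            C, matrix_n); // ldc
    if (!gemm_op_t::can_implement(arg)) {
        std::cout << "unsupported arguments, skipping launch" << std::endl;
        return; // launching anyway may produce unpredictable results
    }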
diff --git a/examples/01_basic_gemm/basic_gemm.cpp b/examples/01_basic_gemm/basic_gemm.cpp index 80f9f3ac..dcb3cf3e 100644 --- a/examples/01_basic_gemm/basic_gemm.cpp +++ b/examples/01_basic_gemm/basic_gemm.cpp @@ -99,7 +99,9 @@ void basic_gemm_run(uint32_t iter) { ::brgemm; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_default, tile_shape, + xetla::group::epilogue_policy_default, + tile_shape, mem_desc_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/02_basic_brgemm/basic_brgemm.cpp b/examples/02_basic_brgemm/basic_brgemm.cpp index 363f5c22..a0fb35c9 100644 --- a/examples/02_basic_brgemm/basic_brgemm.cpp +++ b/examples/02_basic_brgemm/basic_brgemm.cpp @@ -145,9 +145,9 @@ void basic_brgemm_run(uint32_t iter) { brgemm_t brgemm; // Step 4: epilogue function to overwrite the result - using epilogue_t - = epilogue_t, - tile_shape, mem_desc_output_c>; + using epilogue_t = epilogue_t< + epilogue_policy_default, + tile_shape, mem_desc_output_c>; // Step 5: define the shared local memory usages // developers have the responsibility to set diff --git a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp index cec6d644..630cc0da 100644 --- a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp +++ b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp @@ -25,7 +25,7 @@ template int gemm_relu_bias_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_d *D_device, uint32_t m, uint32_t k, - uint32_t n, sycl::queue queue, + uint32_t n, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { auto A = alloc_host_and_copy(A_device, m * k, queue); @@ -167,8 +167,7 @@ void gemm_relu_bias_run(uint32_t iter) { using mem_desc_output_t = mem_desc_t; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_tile_op, + xetla::group::epilogue_policy_tile_op, tile_shape, mem_desc_output_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/04_brgemm_polynomial/brgemm_polynomial.cpp b/examples/04_brgemm_polynomial/brgemm_polynomial.cpp index 2c03c6e8..ecce1a07 100644 --- a/examples/04_brgemm_polynomial/brgemm_polynomial.cpp +++ b/examples/04_brgemm_polynomial/brgemm_polynomial.cpp @@ -26,7 +26,7 @@ template int gemm_polynomial_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, int m, int k, int n, - sycl::queue queue, mem_layout mem_layout_a_ = mem_layout::row_major, + sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { auto A = alloc_host_and_copy(A_device, m * k, queue); auto B = alloc_host_and_copy(B_device, k * n, queue); @@ -203,10 +203,9 @@ void brgemm_polynomial_run(int iter) { // is already calculated. 
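The example churn in these hunks is mostly mechanical: `epilogue_policy_tile_op` loses its `update_method` parameter (tile-op epilogues now always overwrite), so every instantiation drops one template argument. A hedged before/after sketch; `tile_op_chain_t` is an illustrative stand-in, since the concrete operator lists are elided in this diff:

    // Illustrative fused-epilogue migration for the examples above.
    using tile_op_chain_t = chained_tile_op_t</* op list elided */>;

    // Before: the store method was a policy parameter.
    // using epilogue_t = xetla::group::epilogue_t<
    //         xetla::group::epilogue_policy_tile_op<
    //                 tile_op_chain_t, result_overwrite, gpu_arch::Xe>,
    //         tile_shape, mem_desc_output_t>;

    // After: overwrite is implied, so the parameter is gone.
    using epilogue_t = xetla::group::epilogue_t<
            xetla::group::epilogue_policy_tile_op<tile_op_chain_t, gpu_arch::Xe>,
            tile_shape, mem_desc_output_t>;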
// Mathematically epilogue_t is a map that applies to each element: // epilogue_t: [m, n] -> [m, n], C_acc |-> tile_op_t(C_acc) - using epilogue_t - = epilogue_t, - tile_shape, mem_desc_output_c>; + using epilogue_t = epilogue_t< + epilogue_policy_tile_op, + tile_shape, mem_desc_output_c>; // [Polynomial] define arguments for each epilogue_tile_op in chained_tile_op_t<> using epilogue_tile_op_args_t diff --git a/examples/05_batch_gemm/batch_gemm.cpp b/examples/05_batch_gemm/batch_gemm.cpp index b9c72cac..39c2d832 100644 --- a/examples/05_batch_gemm/batch_gemm.cpp +++ b/examples/05_batch_gemm/batch_gemm.cpp @@ -111,7 +111,9 @@ void batch_gemm_run(uint32_t iter) { ::brgemm; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_default, tile_shape, + xetla::group::epilogue_policy_default, + tile_shape, mem_desc_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/06_splitk_brgemm/splitk_brgemm.cpp b/examples/06_splitk_brgemm/splitk_brgemm.cpp index 595b65a5..60bd84c2 100644 --- a/examples/06_splitk_brgemm/splitk_brgemm.cpp +++ b/examples/06_splitk_brgemm/splitk_brgemm.cpp @@ -157,10 +157,10 @@ void splitk_brgemm_run(uint32_t iter) { // [Split-K] When Split-K is used, update_method should be set to // result_reduce_sum in order to aggregate partial sum from each sub-task // to the final output matrix C - using epilogue_t = epilogue_t< - epilogue_policy_tile_op, - result_reduce_sum, gpu_arch::Xe>, - tile_shape, mem_desc_output_c>; + using epilogue_t + = epilogue_t, + tile_shape, mem_desc_output_c>; // Step 5: define the shared local memory usages // developers have the responsibility to set diff --git a/examples/07_gemm_softmax/gemm_softmax.cpp b/examples/07_gemm_softmax/gemm_softmax.cpp index 36e3cfc6..89bc2e00 100644 --- a/examples/07_gemm_softmax/gemm_softmax.cpp +++ b/examples/07_gemm_softmax/gemm_softmax.cpp @@ -26,7 +26,7 @@ template int gemm_softmax_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, uint32_t m, uint32_t k, uint32_t n, - uint32_t batch_num, sycl::queue queue, + uint32_t batch_num, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { uint32_t err_cnt = 0; @@ -213,7 +213,8 @@ void gemm_softmax_run(uint32_t iter) { // epilogue function to overwrite the result using epilogue_t = epilogue_t< - epilogue_policy_default, + epilogue_policy_default, tile_shape, mem_desc_c_t>; // using experimental::group::softmax diff --git a/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp b/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp index b271f813..f98039f7 100644 --- a/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp +++ b/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp @@ -37,7 +37,7 @@ template int mlp_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_w *W_device, data_type_v *V_device, - uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue queue, + uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_w_ = mem_layout::row_major, mem_layout mem_layout_v_ = mem_layout::row_major) { @@ -248,8 +248,7 @@ void mlp_run(uint32_t iter) { mem_layout::row_major, mem_space::global>; using epilogue_t = epilogue_t< epilogue_policy_tile_op< - chained_tile_op_t, - result_overwrite, gpu_arch::Xe>, + chained_tile_op_t, gpu_arch::Xe>, tile_shape, mem_desc_output_t>; // [MLP] 
Define tile_op arguments using epilogue_tile_op_args_t @@ -351,8 +350,8 @@ void mlp_run(uint32_t iter) { ::brgemm; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp b/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp index 9cc19c72..43cf0e56 100644 --- a/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp +++ b/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp @@ -31,7 +31,7 @@ template int sdp_fwd_result_validate(dtype_in *q_device, dtype_in *k_device, dtype_in *v_device, dtype_in *mask_device, dtype_out *c_device, uint32_t qk_m, uint32_t qk_k, uint32_t qk_n, uint32_t sv_m, - uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue queue, + uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue &queue, mem_layout mem_layout_qk_a_ = mem_layout::row_major, mem_layout mem_layout_qk_b_ = mem_layout::row_major, mem_layout mem_layout_sv_a_ = mem_layout::row_major, @@ -267,8 +267,7 @@ void sdp_fwd_run(uint32_t iter) { sg_tile_k_qk, mma_engine::xmx, gpu_arch::Xe, prefetch_distance, periodic_sync_interval>::brgemm; using epilogue0_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_tile_op, tile_shape0, mem_desc_t>; diff --git a/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp b/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp index b1034568..05e65ce0 100644 --- a/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp +++ b/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp @@ -24,7 +24,7 @@ int validation(data_type *layer_inputs_device, data_type *h0_inputs_device, std::vector i_weights, std::vector h_weights, data_type *hidden_outputs_device, data_type *layer_outputs_device, uint32_t batch_size, uint32_t input_size, uint32_t hidden_size, - uint32_t sequence_length, sycl::queue queue, uint32_t layer_size = 1) { + uint32_t sequence_length, sycl::queue &queue, uint32_t layer_size = 1) { uint32_t layer_input_size = batch_size * input_size; uint32_t hidden_io_size = batch_size * hidden_size; uint32_t i_weight_size = input_size * hidden_size; @@ -268,6 +268,9 @@ void gru_run(uint32_t iter) { data[idx] = static_cast(0.001 * random_float()); }, queue, device, context); + + i_weights.push_back(ir_weights); + auto iz_weights = alloc_device_and_init( input_weight_size, [](data_type *data, size_t idx) { @@ -280,7 +283,7 @@ void gru_run(uint32_t iter) { auto in_weights = alloc_device_and_init( input_weight_size, [](data_type *data, size_t idx) { - data[idx] = static_cast(0.001 * random_float()); + data[idx] = static_cast(0.0001 * random_float()); }, queue, device, context); diff --git a/examples/10_gate_recurrent_unit/kernel_func.hpp b/examples/10_gate_recurrent_unit/kernel_func.hpp index bb2daf44..07331dac 100644 --- a/examples/10_gate_recurrent_unit/kernel_func.hpp +++ b/examples/10_gate_recurrent_unit/kernel_func.hpp @@ -128,8 +128,9 @@ struct gru_layer { // define arguments for each epilogue_tile_op in chained_tile_op_t<> - using epilogue_t = epilogue_t, - tile_shape, mem_desc_c_t>; + using epilogue_t = epilogue_t< + epilogue_policy_default, tile_shape, + mem_desc_c_t>; using epilogue_args_t = typename epilogue_t::arguments_t; using matC_tile_desc_t = tile_desc_t +template struct epilogue_policy_default { + using update_method = update_method_; static constexpr gpu_arch arch_tag = arch_; + 
static_assert(std::is_same::value + || std::is_same::value, + "The result can be either overwrite or reduce_sum"); }; /// @brief Epilogue policy for tile_op + store C fusion. /// @tparam tile_op_t_ Is the tile_op functor. -/// @tparam update_method_ Is the store method of matC. /// @tparam arch_ Is the HW architecture. -template +template struct epilogue_policy_tile_op { using tile_op = tile_op_t_; - using update_method = update_method_; - static_assert(std::is_same::value - || std::is_same::value, - "The result can be either overwrite or reduce_sum"); + using update_method = result_overwrite; static constexpr gpu_arch arch_tag = arch_; }; /// @brief Epilogue functor, specialized for quantization operator. /// @tparam tile_op_t_ is the tile op type. /// @tparam quant_op_t_ is the quantization op type -/// @tparam update_method_ is update policy for result store. /// @tparam arch_ Is the HW architecture. template struct epilogue_policy_quant_op { using tile_op = tile_op_t_; using quant_op = quant_op_t_; - using update_method = update_method_; - static_assert(std::is_same::value - || std::is_same::value, - "The result can be either overwrite or reduce_sum"); + using update_method = result_overwrite; static constexpr gpu_arch arch_tag = arch_; }; /// @} xetla_epilogue diff --git a/include/group/epilogue/impl/default_xe.hpp b/include/group/epilogue/impl/default_xe.hpp index f498c2f5..e4856606 100644 --- a/include/group/epilogue/impl/default_xe.hpp +++ b/include/group/epilogue/impl/default_xe.hpp @@ -29,12 +29,13 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_default and Xe architecture. -template -class epilogue_t, tile_shape_, - mem_desc_c_t_> { +template +class epilogue_t, + tile_shape_, mem_desc_c_t_> { public: - using epilogue_policy = epilogue_policy_default; - using update_method = result_overwrite; + using epilogue_policy + = epilogue_policy_default; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -55,8 +56,14 @@ class epilogue_t, tile_shape_, static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout; static constexpr mem_space mem_space_c = mem_desc_c_t::space; static constexpr msg_type msg_type_c - = (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter); + = std::is_same::value + ? (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter) + : msg_type::atomic_add; + static_assert(!(std::is_same::value + && mem_space_c == mem_space::local), + "Local memory not support result_reduce_sum"); + /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( work_group_t &g, mem_desc_c_t &mem_desc_c) { diff --git a/include/group/epilogue/impl/quant_tile_op_xe.hpp b/include/group/epilogue/impl/quant_tile_op_xe.hpp index 73f3a622..dadc8eb9 100644 --- a/include/group/epilogue/impl/quant_tile_op_xe.hpp +++ b/include/group/epilogue/impl/quant_tile_op_xe.hpp @@ -29,17 +29,17 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_quant_op and Xe architecture. 
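With `update_method` now carried by `epilogue_policy_default` (and `result_reduce_sum` steering the matC store to `msg_type::atomic_add` on global memory, per the `static_assert` added to `default_xe.hpp` above), split-K kernels select their epilogue as sketched below. This mirrors the kernel_func.hpp hunks further down; `l3_kslicing`, `tile_shape`, and `mem_desc_output_c` are assumed from that context:

    // Hedged sketch: split-K now picks reduce-vs-overwrite through the
    // default epilogue policy rather than the tile-op policy.
    using update_method = typename std::conditional<(l3_kslicing > 1),
            result_reduce_sum, result_overwrite>::type;
    using epilogue_t = xetla::group::epilogue_t<
            xetla::group::epilogue_policy_default<update_method, gpu_arch::Xe>,
            tile_shape, mem_desc_output_c>;

With `result_reduce_sum`, each split-K sub-task atomically accumulates its partial sums into the output matrix C instead of overwriting it.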
-template -class epilogue_t, +template +class epilogue_t< + epilogue_policy_quant_op, tile_shape_, mem_desc_c_t_> { public: - using epilogue_policy = epilogue_policy_quant_op; + using epilogue_policy + = epilogue_policy_quant_op; using quant_op_t = quant_op_t_; using tile_op_t = tile_op_t_; - using update_method = update_method_; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -100,14 +100,8 @@ class epilogue_t::value - ? (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter) - : msg_type::atomic_add; - - static_assert(!(std::is_same::value - && mem_space_c == mem_space::local), - "Local memory not support result_reduce_sum"); + = (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter); /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( diff --git a/include/group/epilogue/impl/tile_op_xe.hpp b/include/group/epilogue/impl/tile_op_xe.hpp index 26372277..17fab24e 100644 --- a/include/group/epilogue/impl/tile_op_xe.hpp +++ b/include/group/epilogue/impl/tile_op_xe.hpp @@ -29,16 +29,13 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_tile_op and Xe architecture. -template -class epilogue_t< - epilogue_policy_tile_op, - tile_shape_, mem_desc_c_t_> { +template +class epilogue_t, tile_shape_, + mem_desc_c_t_> { public: - using epilogue_policy - = epilogue_policy_tile_op; + using epilogue_policy = epilogue_policy_tile_op; using tile_op_t = tile_op_t_; - using update_method = update_method_; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -90,14 +87,8 @@ class epilogue_t< static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout; static constexpr mem_space mem_space_c = mem_desc_c_t::space; static constexpr msg_type msg_type_c - = std::is_same::value - ? (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter) - : msg_type::atomic_add; - - static_assert(!(std::is_same::value - && mem_space_c == mem_space::local), - "Local memory not support result_reduce_sum"); + = (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter); /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( diff --git a/include/subgroup/tile/config.hpp b/include/subgroup/tile/config.hpp deleted file mode 100644 index 4f8bae5c..00000000 --- a/include/subgroup/tile/config.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2022-2023 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/
-
-/// @file
-/// C++ API
-
-#pragma once
-
-#include "subgroup/tile/common.hpp"
-
-namespace gpu::xetla::subgroup {
-
-/// @brief
-///
-/// @tparam update_dir_
-/// @tparam omode_
-/// @tparam cyclic_size_
-template <tdesc_update_dir update_dir_, offset_mode omode_,
-        uint32_t cyclic_size_>
-struct mem_update_config_t {
-    static constexpr tdesc_update_dir update_dir = update_dir_;
-    static constexpr offset_mode omode = omode_;
-    static constexpr uint32_t cyclic_size = cyclic_size_;
-    // static_assert((omode != offset_mode::const_offset) || cyclic_size == 1,
-    //         "for const_offset, the cyclic size should be 1");
-};
-
-} // namespace gpu::xetla::subgroup
diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md
index 744ad966..30ecc97e 100644
--- a/media/docs/programming_guidelines.md
+++ b/media/docs/programming_guidelines.md
@@ -15,15 +15,11 @@ There are two groups of APIs to implement GEMM: brgemm (micro-kernels) in group
 ## The Key Things for Better Performance
 Intel® XeTLA provides the basic building blocks of a GEMM unit; however, the kernel still needs to be implemented carefully to achieve good performance at both the algorithm and hardware levels.
-1. Hardware Compute Unit
-In Intel's GPU, the compute unit is organized by sub-slices, and there are many Execution Unit (EU) and shared local memory inside.
-The Intel® XeTLA's micro-kernel is designed to fully utilize the whole sub-slices to archieve the best performance. Thus, the software developers response to allocate at least the number of work-group equal with the number of sub-slices.
-
-3. Number of work-group / sub-group
-4. K slicing algorithm
-5. Reuse register for post operations
-6. Data sharing through shared local memory
-7. Reduction
+1. Number of work-group / sub-group
+2. K slicing algorithm
+3. Reuse register for post operations
+4. Data sharing through shared local memory
+5. Reduction
 
 ## How To Implement A GEMM With Building Block
diff --git a/media/docs/quick_start.md b/media/docs/quick_start.md
index dc77737e..d390769b 100644
--- a/media/docs/quick_start.md
+++ b/media/docs/quick_start.md
@@ -2,7 +2,7 @@
 # Quick Start
 - [Preparations](/media/docs/quick_start.md#preparations)
-- [Setup Environment](/media/docs/quick_start.md#setup-build-environment)
+- [Setup Environment](/media/docs/quick_start.md#setup-environment)
 - [Build](/media/docs/quick_start.md#build)
 - [Run Tests](/media/docs/quick_start.md#run-tests)
 - [Run Examples](/media/docs/quick_start.md#run-examples)
diff --git a/media/docs/terminology.md b/media/docs/terminology.md
index e6dcaeea..2037ac8d 100644
--- a/media/docs/terminology.md
+++ b/media/docs/terminology.md
@@ -75,25 +75,6 @@ Intel® XeTLA based on [SYCL](https://registry.khronos.org/SYCL/specs/sycl-2020/
 **work-item**: The SYCL work-item is a representation of a work-item among a collection of parallel executions of a kernel invoked on a device by a command. A work-item is executed by one or more processing elements as part of a work-group executing on a compute unit. A work-item is distinguished from other work-items by its global id or the combination of its work-group id and its local id within a work-group.
 
-Because XeTLA is on the top of SYCL, the most of concept is inherabit from SYCL SPEC.
-https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_work_group_data_parallel_kernels -work_item -work_group -sub_group -Ndrange -Shared local memory -Barrier - -XeTLA -kernel: -group: -subgroup: -tile: -compute policy: -compute_attr: -prefetch_distance: - - # Copyright Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at diff --git a/tests/integration/data_transformer/common.hpp b/tests/integration/data_transformer/common.hpp index c02dd100..05326aed 100644 --- a/tests/integration/data_transformer/common.hpp +++ b/tests/integration/data_transformer/common.hpp @@ -25,7 +25,7 @@ template int data_transformer_result_validate(data_type_in *in_device, data_type_out *out_device, size_t mat_m, size_t mat_n, bool is_transposed, int need_fp8_op, data_type_acc *amax_ptr_device, - data_type_acc *scale_device, sycl::queue queue) { + data_type_acc *scale_device, sycl::queue &queue) { auto in = alloc_host_and_copy( in_device, mat_m * mat_n, queue); auto out = alloc_host_and_copy( diff --git a/tests/integration/gemm/bf16/common.hpp b/tests/integration/gemm/bf16/common.hpp index 29e22495..717b35cb 100644 --- a/tests/integration/gemm/bf16/common.hpp +++ b/tests/integration/gemm/bf16/common.hpp @@ -277,30 +277,13 @@ class Test11 : public TestBase { using data_type_acc = float; }; -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); diff --git a/tests/integration/gemm/bf16/kernel_func.hpp b/tests/integration/gemm/bf16/kernel_func.hpp index f98ad1f8..08f63526 100644 --- a/tests/integration/gemm/bf16/kernel_func.hpp +++ b/tests/integration/gemm/bf16/kernel_func.hpp @@ -44,7 +44,7 @@ struct bgemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/bf16/main.cpp b/tests/integration/gemm/bf16/main.cpp index 97d77bbd..a45e416f 100644 --- a/tests/integration/gemm/bf16/main.cpp +++ b/tests/integration/gemm/bf16/main.cpp @@ -30,9 +30,9 @@ TYPED_TEST_SUITE_P(bgemm_test); TYPED_TEST_P(bgemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, bgemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(bgemm_test, esimd); using tests = ::testing::Types 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/fp16/main.cpp b/tests/integration/gemm/fp16/main.cpp index 
b27d2ac6..7d123e89 100644 --- a/tests/integration/gemm/fp16/main.cpp +++ b/tests/integration/gemm/fp16/main.cpp @@ -370,30 +370,13 @@ class Test16 : public TestBase { using data_type_acc = float; }; -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); @@ -419,9 +402,9 @@ TYPED_TEST_SUITE_P(fp16_gemm_test); TYPED_TEST_P(fp16_gemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, fp16_gemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(fp16_gemm_test, esimd); using tests = ::testing::Types -class input_buffer_init { +class Test9 : public TestBase { public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } + static constexpr size_t batch_size = 64; + static constexpr size_t mat_m = 256; + static constexpr size_t mat_n = 256; + static constexpr size_t mat_k = 256; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 8; + static constexpr uint32_t l3_kslicing = 2; + static constexpr uint32_t slm_kslicing = 1; + static constexpr mem_layout layout_a = mem_layout::row_major; + static constexpr mem_layout layout_b = mem_layout::row_major; + static constexpr mma_engine engine = mma_engine::xmx; +}; + +class Test10 : public TestBase { +public: + static constexpr size_t batch_size = 64; + static constexpr size_t mat_m = 256; + static constexpr size_t mat_n = 256; + static constexpr size_t mat_k = 256; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 8; + static constexpr uint32_t l3_kslicing = 2; + static constexpr uint32_t slm_kslicing = 1; + static constexpr mem_layout layout_a = mem_layout::row_major; + static constexpr mem_layout layout_b = mem_layout::row_major; +}; + +class Test11 : public TestBase { +public: + static constexpr size_t batch_size = 35; + static constexpr size_t mat_m = 4193; + static constexpr size_t mat_k = 1134; + static constexpr size_t mat_n = 686; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 32; + static constexpr uint32_t l3_kslicing = 1; + static constexpr uint32_t slm_kslicing = 16; + static constexpr mem_layout layout_a = mem_layout::col_major; + static constexpr mem_layout layout_b = 
mem_layout::row_major; + static constexpr mma_engine engine = mma_engine::xmx; }; template using sgemm_func = sgemm_test_func; + Test::layout_a, Test::layout_b, Test::l3_kslicing, Test::slm_kslicing, + Test::engine>; template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, - C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, - Test::layout_a, Test::layout_b); + C, Test::batch_size, Test::mat_m, Test::mat_k, Test::mat_n, + queue, context, Test::layout_a, Test::layout_b); } }; diff --git a/tests/integration/gemm/fp32/kernel_func.hpp b/tests/integration/gemm/fp32/kernel_func.hpp index 9fe185da..1ef52f2d 100644 --- a/tests/integration/gemm/fp32/kernel_func.hpp +++ b/tests/integration/gemm/fp32/kernel_func.hpp @@ -23,7 +23,7 @@ using namespace gpu::xetla; template + uint32_t slm_kslicing, mma_engine engine> struct sgemm_test_func { static const char *func_name() { return "sgemm_test_func"; } @@ -38,13 +38,13 @@ struct sgemm_test_func { static constexpr uint32_t prefetch_distance = 1; using brgemm_t = typename brgemm_selector_t::brgemm; + tile_shape, sg_k, engine, gpu_arch::Xe, prefetch_distance, + periodic_sync_interval>::brgemm; using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/fp32/main.cpp b/tests/integration/gemm/fp32/main.cpp index 13cb0cb1..07cf27fd 100644 --- a/tests/integration/gemm/fp32/main.cpp +++ b/tests/integration/gemm/fp32/main.cpp @@ -31,12 +31,14 @@ std::string esimd_compile_string template class sgemm_test : public ::testing::Test {}; TYPED_TEST_SUITE_P(sgemm_test); + TYPED_TEST_P(sgemm_test, esimd) { gemm_exec(TypeParam::mat_m, - TypeParam::mat_n, TypeParam::mat_k, esimd_compile_string); + result_validate, sgemm_func>(TypeParam::mat_m, TypeParam::mat_n, + TypeParam::mat_k, esimd_compile_string, TypeParam::batch_size); } + REGISTER_TYPED_TEST_SUITE_P(sgemm_test, esimd); using tests = ::testing::Types; + Test8, Test9, Test10, Test11>; INSTANTIATE_TYPED_TEST_SUITE_P(sgemm_test_suite, sgemm_test, tests); diff --git a/tests/integration/gemm/int8/common.hpp b/tests/integration/gemm/int8/common.hpp index d7f3590e..269bd5e5 100644 --- a/tests/integration/gemm/int8/common.hpp +++ b/tests/integration/gemm/int8/common.hpp @@ -182,38 +182,62 @@ class Test6 : public TestBase { using data_type_acc = int; }; -template -class input_buffer_init { +template +using int8_gemm_func = int8gemm_test_func; + +template +class result_validate { + public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; + int operator()(data_type_a *A_device, data_type_b *B_device, + data_type_c *C_device, sycl::queue &queue, sycl::context &context) { + auto A = alloc_host_and_copy( + A_device, Test::mat_m * Test::mat_k, queue); + auto B = alloc_host_and_copy( + B_device, Test::mat_k * Test::mat_n, queue); + auto C = alloc_host_and_copy( + C_device, Test::mat_m * Test::mat_n, queue); + + buff_cmp::buff_vals data( + C, Test::mat_m, Test::mat_n, 
Test::mat_n); + std::vector acc_buffer(Test::mat_m * Test::mat_n, 0); + + { + bool is_col_major_a = Test::layout_a == mem_layout::col_major; + bool is_col_major_b = Test::layout_b == mem_layout::col_major; + for (int i = 0; i < Test::mat_m; i++) { + for (int j = 0; j < Test::mat_n; j++) { + for (int k = 0; k < Test::mat_k; k++) { + data_type_acc a_temp = is_col_major_a + ? A[i + k * Test::mat_m] + : A[i * Test::mat_k + k]; + data_type_acc b_temp = is_col_major_b + ? B[k + j * Test::mat_k] + : B[k * Test::mat_n + j]; + acc_buffer[i * Test::mat_n + j] + = acc_buffer[i * Test::mat_n + j] + + a_temp * b_temp; + } + } + } } - } -}; -template -using int8_gemm_func = int8gemm_test_func; + buff_cmp::buff_vals other( + acc_buffer.data(), Test::mat_m, Test::mat_n, Test::mat_n); + bool result = buff_cmp::xetla_buff_cmp(data, other, + Test::name(Test::mat_m, Test::mat_n, Test::mat_k, Test::wg_m, + Test::wg_n, Test::sg_m, Test::sg_n, Test::sg_k, + Test::layout_a, Test::layout_b)); -template -class result_validate { + free(A); + free(B); + free(C); -public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { - return gemm_result_validate(A, B, - C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, - Test::layout_a, Test::layout_b); + return result ? 0 : 1; } }; diff --git a/tests/integration/gemm/int8/kernel_func.hpp b/tests/integration/gemm/int8/kernel_func.hpp index 636398ef..a3561089 100644 --- a/tests/integration/gemm/int8/kernel_func.hpp +++ b/tests/integration/gemm/int8/kernel_func.hpp @@ -44,7 +44,7 @@ struct int8gemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/int8/main.cpp b/tests/integration/gemm/int8/main.cpp index 92829979..a50a59d1 100644 --- a/tests/integration/gemm/int8/main.cpp +++ b/tests/integration/gemm/int8/main.cpp @@ -35,9 +35,9 @@ TYPED_TEST_SUITE_P(int8_gemm_test); TYPED_TEST_P(int8_gemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, int8_gemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(int8_gemm_test, esimd); diff --git a/tests/integration/gemm/int8_quantization/common.hpp b/tests/integration/gemm/int8_quantization/common.hpp index 5e4942ef..138fc1e0 100644 --- a/tests/integration/gemm/int8_quantization/common.hpp +++ b/tests/integration/gemm/int8_quantization/common.hpp @@ -25,7 +25,7 @@ int gemm_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_param *scale_device, data_type_param *offset_device, int m, int k, int n, gpu::xetla::mem_layout mem_layout_a, - gpu::xetla::mem_layout mem_layout_b, sycl::queue queue) { + gpu::xetla::mem_layout mem_layout_b, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, m * k, queue); auto B = alloc_host_and_copy(B_device, k * n, queue); diff --git a/tests/integration/gemm/int8_quantization/kernel_func.hpp b/tests/integration/gemm/int8_quantization/kernel_func.hpp index 1f3c7e85..8cb0c27b 100644 --- a/tests/integration/gemm/int8_quantization/kernel_func.hpp +++ b/tests/integration/gemm/int8_quantization/kernel_func.hpp @@ -37,7 +37,7 @@ struct igemm_quantize_func { using quant_op_t = subgroup::quant_op_t; using epilogue_t = 
gpu::xetla::group::epilogue_t< gpu::xetla::group::epilogue_policy_quant_op, + gpu_arch::Xe>, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/tf32/kernel_func.hpp b/tests/integration/gemm/tf32/kernel_func.hpp index 8e272782..fec8bc5b 100644 --- a/tests/integration/gemm/tf32/kernel_func.hpp +++ b/tests/integration/gemm/tf32/kernel_func.hpp @@ -42,7 +42,7 @@ struct gemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/tf32/main.cpp b/tests/integration/gemm/tf32/main.cpp index 40d7645b..85047073 100644 --- a/tests/integration/gemm/tf32/main.cpp +++ b/tests/integration/gemm/tf32/main.cpp @@ -19,29 +19,12 @@ #include "test.hpp" #include -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); @@ -67,9 +50,8 @@ TYPED_TEST_SUITE_P(gemm_tf32); TYPED_TEST_P(gemm_tf32, esimd) { gemm_exec( - TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, - esimd_compile_string); + float, result_validate, gemm_func>(TypeParam::mat_m, + TypeParam::mat_n, TypeParam::mat_k, esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(gemm_tf32, esimd); INSTANTIATE_TYPED_TEST_SUITE_P(gemm_tf32_suite, gemm_tf32, tests); diff --git a/tests/integration/row_reduction/common.hpp b/tests/integration/row_reduction/common.hpp index 1b7c7280..dafca051 100644 --- a/tests/integration/row_reduction/common.hpp +++ b/tests/integration/row_reduction/common.hpp @@ -31,7 +31,7 @@ int reduction_result_validate(data_type_in *device_in, data_type_out *device_out, data_type_w *device_w_in, data_type_x *device_x_out, data_type_d *device_d_out, uint8_t *device_mask_in, int m, int n, float scale, - reduction_fused_kind fused_op, sycl::queue queue) { + reduction_fused_kind fused_op, sycl::queue &queue) { int err_cnt = 0; bool is_bias_gelu_bwd = fused_op == reduction_fused_kind::bias_gelu_w_bwd; bool is_bias_dropout_bwd diff --git a/tests/integration/sg_dropout_op/common.hpp b/tests/integration/sg_dropout_op/common.hpp index bed1f604..3c906b87 100644 --- a/tests/integration/sg_dropout_op/common.hpp +++ b/tests/integration/sg_dropout_op/common.hpp @@ -30,7 +30,7 @@ template int dropout_result_validate(data_type_x *in_device, data_type_y *out_device, int m, int n, uint8_t *buffer_mask_device, float drop_out_scale, - sycl::queue queue) { + sycl::queue &queue) { auto in = alloc_host_and_copy(in_device, m * n, queue); auto out = alloc_host_and_copy(out_device, m * n, queue); auto buffer_mask diff --git a/tests/integration/sg_dropout_op/kernel_func.hpp b/tests/integration/sg_dropout_op/kernel_func.hpp index fb160cd0..c60de6e2 100644 --- a/tests/integration/sg_dropout_op/kernel_func.hpp +++ b/tests/integration/sg_dropout_op/kernel_func.hpp @@ -56,10 +56,9 @@ struct 
dropout_func_t { subgroup::dropout_op_t, subgroup::rng_dropout_op_t>::type; - using epilogue_t - = group::epilogue_t, - tile_shape, mem_desc_out_t>; + using epilogue_t = group::epilogue_t< + group::epilogue_policy_tile_op, tile_shape, + mem_desc_out_t>; using epilogue_args_t = typename epilogue_t::arguments_t; static inline void run(xetla_exec_item<3> &ei, dtype_in *mat_in_ptr, diff --git a/tests/integration/vector_add/bf16_2d/common.hpp b/tests/integration/vector_add/bf16_2d/common.hpp index 1295e895..5d7b675d 100644 --- a/tests/integration/vector_add/bf16_2d/common.hpp +++ b/tests/integration/vector_add/bf16_2d/common.hpp @@ -26,7 +26,8 @@ class Test1; #define data_type bf16 int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, unsigned pitch, sycl::queue queue) { + data_type *C_device, unsigned Size, unsigned pitch, + sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, pitch * pitch, queue); auto B = alloc_host_and_copy(B_device, pitch * pitch, queue); auto C = alloc_host_and_copy(C_device, pitch * pitch, queue); diff --git a/tests/integration/vector_add/int32_1d/common.hpp b/tests/integration/vector_add/int32_1d/common.hpp index bdf408ee..d5192be3 100644 --- a/tests/integration/vector_add/int32_1d/common.hpp +++ b/tests/integration/vector_add/int32_1d/common.hpp @@ -25,7 +25,7 @@ using namespace gpu::xetla; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/integration/vector_add/int32_2d/common.hpp b/tests/integration/vector_add/int32_2d/common.hpp index 846a8254..6e754261 100644 --- a/tests/integration/vector_add/int32_2d/common.hpp +++ b/tests/integration/vector_add/int32_2d/common.hpp @@ -24,7 +24,7 @@ class Test1; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/integration/vector_add/tf32_1d/common.hpp b/tests/integration/vector_add/tf32_1d/common.hpp index 3cbb26b9..22b78902 100644 --- a/tests/integration/vector_add/tf32_1d/common.hpp +++ b/tests/integration/vector_add/tf32_1d/common.hpp @@ -28,7 +28,7 @@ class Test1; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/unit/epilogue_tile_op/kernel_func.hpp b/tests/unit/epilogue_tile_op/kernel_func.hpp index f4976cd5..0f67c03e 100644 --- a/tests/unit/epilogue_tile_op/kernel_func.hpp +++ b/tests/unit/epilogue_tile_op/kernel_func.hpp @@ -34,8 +34,7 @@ struct tile_elemwise_op_func { using tile_shape = tile_shape_t; using mem_desc_c_t = mem_desc_t; - using epilogue_policy = epilogue_policy_tile_op; + using epilogue_policy = epilogue_policy_tile_op; using epilogue_t = 
epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -70,7 +69,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -105,7 +104,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -136,7 +135,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -165,7 +164,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op< elemwise_reduce_op_t, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; diff --git a/tests/utils/buff_compare.hpp b/tests/utils/buff_compare.hpp index 3acb5b7b..bd587a25 100644 --- a/tests/utils/buff_compare.hpp +++ b/tests/utils/buff_compare.hpp @@ -34,7 +34,7 @@ namespace buff_cmp { /// @{ // convenient datatype to represent ulp-converted fp buffers -using ulp_vec = std::vector; +using ulp_vec = std::vector; /// ///@brief Structure used to describe tensors / buffers as stdlib vectors, idx_mapping vector is used to ignore "unwanted" elements from a tensor array. @@ -44,8 +44,8 @@ template struct buff_vals { using type = dtype; std::vector buff; - std::vector idx_mapping; - unsigned size; + std::vector idx_mapping; + size_t size; /// @brief Initializes and empty buff_vals structure. /// @tparam dtype Datatype of the output structure and input buffer. @@ -55,10 +55,10 @@ struct buff_vals { /// @tparam dtype Datatype of the output structure and input buffer. /// @param data Pointer to buffer of input data. /// @param n Size of input buffer data - buff_vals(dtype_src *data, uint32_t n) { + buff_vals(dtype_src *data, size_t n) { this->size = n; this->buff.resize(this->size, 0); - for (uint32_t i = 0; i < this->size; ++i) { + for (size_t i = 0; i < this->size; ++i) { this->buff[i] = data[i]; this->idx_mapping.push_back(i); } @@ -70,14 +70,13 @@ struct buff_vals { /// @param Blocky By default used to define size of input buffer. If input is organized as tensor (meaning, by blocks) then defines block height. /// @param Blockx Default value is 1 for non-tensor buffers, otherwise describes tensor block width. /// @param Sizex Default value is 1 for non-tensor buffers, otherwise describes tensor pitch. 
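These `buff_compare` hunks widen indices and sizes from `unsigned`/`uint32_t`/`uint64_t` to `size_t`, so validation no longer overflows on large tensors. A hedged usage sketch of the tensor-shaped constructor documented above together with `xetla_buff_cmp`, mirroring the rewritten int8 validator earlier in this diff (buffer names and extents are illustrative):

    // C_host is the copied-back device result, ref a host reference,
    // both of logical shape mat_m x mat_n with pitch mat_n.
    buff_cmp::buff_vals<float> data(C_host, mat_m, mat_n, mat_n);
    buff_cmp::buff_vals<float> gold(ref.data(), mat_m, mat_n, mat_n);
    // Default tolerances: up to 2% differing elements, plus
    // dtype-specific ULP and absolute thresholds.
    bool ok = buff_cmp::xetla_buff_cmp(data, gold, "gemm_validation");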
- buff_vals( - dtype_src *data, uint32_t Blocky, uint32_t Blockx, uint32_t Sizex) { + buff_vals(dtype_src *data, size_t Blocky, size_t Blockx, size_t Sizex) { this->size = Blockx * Blocky; this->buff.resize(this->size, 0); this->idx_mapping.resize(this->size, 0); - unsigned idx = 0; - for (int i = 0; i < Blocky; ++i) { - for (int j = 0; j < Blockx; ++j) { + size_t idx = 0; + for (size_t i = 0; i < Blocky; ++i) { + for (size_t j = 0; j < Blockx; ++j) { this->buff[idx] = data[i * Sizex + j]; this->idx_mapping[idx] = i * Sizex + j; ++idx; @@ -89,7 +88,7 @@ struct buff_vals { /// @tparam dtype Datatype of the given structure and new element. /// @param val Element to be added to structure. /// @param idx Index mapping for element being added. - void push(dtype val, uint32_t idx) { + void push(dtype val, size_t idx) { ++this->size; this->buff.push_back(val); this->idx_mapping.push_back(idx); @@ -121,15 +120,15 @@ struct buff_vals { /// template buff_vals xetla_get_buff_vals( - dtype *data, unsigned Blocky, unsigned Blockx = 1, unsigned Sizex = 1) { + dtype *data, size_t Blocky, size_t Blockx = 1, size_t Sizex = 1) { buff_vals res; res.size = Blockx * Blocky; res.buff.resize(res.size, 0); res.idx_mapping.resize(res.size, 0); - unsigned idx = 0; - for (int i = 0; i < Blocky; ++i) { - for (int j = 0; j < Blockx; ++j) { + size_t idx = 0; + for (size_t i = 0; i < Blocky; ++i) { + for (size_t j = 0; j < Blockx; ++j) { res.buff[idx] = data[i * Sizex + j]; res.idx_mapping[idx] = i * Sizex + j; ++idx; @@ -142,7 +141,7 @@ buff_vals xetla_get_buff_vals( struct rel_abs_vals { std::vector ate; std::vector rte; - unsigned size; + size_t size; }; /// @@ -171,7 +170,7 @@ rel_abs_vals xetla_get_rte_and_ate(T1 &v1, T2 &v2) { res.rte.resize(res.size, 0.0); res.ate.resize(res.size, 0.0); - for (unsigned i = 0; i < res.size; ++i) { + for (size_t i = 0; i < res.size; ++i) { res.ate[i] = get_ate(v1.buff[i], v2.buff[i]); res.rte[i] = get_rte(v1.buff[i], v2.buff[i]); } @@ -190,7 +189,7 @@ ulp_vec xetla_get_ulp_buffer(T &v1) { using dtype = typename T::type; using uint_dtype = gpu::xetla::uint_type_t; ulp_vec ulp_buff(v1.size, 0); - for (unsigned i = 0; i < ulp_buff.size(); ++i) { + for (size_t i = 0; i < ulp_buff.size(); ++i) { uint_dtype val = (*reinterpret_cast(&v1.buff[i])); ulp_buff[i] = val; } @@ -211,7 +210,7 @@ ulp_vec xetla_get_ulp_buffer(T &v1) { /// template bool _handle_fp_types(buff_vals &data, buff_vals &other, - std::string name, uint64_t ulp_tol, double abs_tol) { + std::string name, size_t ulp_tol, double abs_tol) { if (std::is_same, gpu::xetla::bf16>::value) { if (ulp_tol == 0) ulp_tol = 8; if (abs_tol == 0) abs_tol = 0.25; @@ -233,7 +232,7 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, ulp_vec ulp_data = xetla_get_ulp_buffer(data); ulp_vec ulp_other = xetla_get_ulp_buffer(other); - auto get_ulp_ate = [=](uint64_t a, uint64_t b) { + auto get_ulp_ate = [=](size_t a, size_t b) { if (a > b) return a - b; else @@ -242,10 +241,10 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, ulp_vec aulpte; aulpte.resize(ulp_data.size(), 0); - for (unsigned i = 0; i < ulp_data.size(); ++i) + for (size_t i = 0; i < ulp_data.size(); ++i) aulpte[i] = get_ulp_ate(ulp_data[i], ulp_other[i]); - unsigned aulpidx + size_t aulpidx = std::max_element(aulpte.begin(), aulpte.end()) - aulpte.begin(); std::cout << "\t" @@ -258,16 +257,16 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, << "data_val: " << ulp_data[aulpidx] << " gold_val: " << ulp_other[aulpidx] << std::endl; - uint64_t ulp_threshold = 
ulp_tol; + size_t ulp_threshold = ulp_tol; double small_num_threshold = abs_tol; - uint32_t diff_elems_count = 0; + size_t diff_elems_count = 0; bool flag = true; - for (unsigned i = 0; i < ulp_data.size(); ++i) { + for (size_t i = 0; i < ulp_data.size(); ++i) { float des = other.buff[i]; float act = data.buff[i]; - uint64_t ulp_des = ulp_other[i]; - uint64_t ulp_act = ulp_data[i]; - uint64_t sub_ulp = ulp_act - ulp_des; + size_t ulp_des = ulp_other[i]; + size_t ulp_act = ulp_data[i]; + size_t sub_ulp = ulp_act - ulp_des; if (ulp_des > ulp_act) sub_ulp = ulp_des - ulp_act; if (!((fabs(act - des) <= small_num_threshold) || (sub_ulp <= ulp_threshold))) { @@ -292,7 +291,7 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, template bool _cast_and_handle_fp_types(T1 &data, T2 &other, std::string name, - double diff_elems_tol, uint64_t ulp_tol, double abs_tol) { + double diff_elems_tol, size_t ulp_tol, double abs_tol) { buff_vals casted_data, casted_other; casted_data.size = data.size; casted_data.buff @@ -323,8 +322,7 @@ bool _cast_and_handle_fp_types(T1 &data, T2 &other, std::string name, /// template bool xetla_buff_cmp(T1 &data, T2 &other, std::string name, - double diff_elems_tol = 0.02, uint64_t ulp_tol = 0, - double abs_tol = 0) { + double diff_elems_tol = 0.02, size_t ulp_tol = 0, double abs_tol = 0) { if (data.size != other.size) { std::cout << "ERROR: buffer size or shape mismatch!\n"; return false; @@ -383,8 +381,8 @@ bool xetla_buff_cmp(T1 &data, T2 &other, std::string name, data, other, name, diff_elems_tol, ulp_tol, abs_tol); } } else { - uint32_t diff_elems_count = 0; - for (unsigned i = 0; i < data.size; ++i) + size_t diff_elems_count = 0; + for (size_t i = 0; i < data.size; ++i) if (data.buff[i] != other.buff[i]) ++diff_elems_count; float fail_rate = diff_elems_count / ((float)data.size) * 100; float pass_rate = 100 - fail_rate; diff --git a/tests/utils/common.hpp b/tests/utils/common.hpp index 1678e512..3fd6c5b7 100644 --- a/tests/utils/common.hpp +++ b/tests/utils/common.hpp @@ -101,7 +101,7 @@ inline data_type *alloc_device_and_init(size_t size, sycl::queue &queue, sycl::device &device, sycl::context &context) { auto host_ptr = static_cast(malloc(size * sizeof(data_type))); - for (uint32_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { init_func(host_ptr, i); } @@ -129,15 +129,15 @@ template int gemm_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, uint32_t batch_size, uint32_t m, uint32_t k, - uint32_t n, sycl::queue queue, sycl::context context, + uint32_t n, sycl::queue &queue, sycl::context &context, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { bool is_col_major_a = mem_layout_a_ == mem_layout::col_major; bool is_col_major_b = mem_layout_b_ == mem_layout::col_major; // define slice of each matrices - uint32_t size_a_slice = m * k; - uint32_t size_b_slice = k * n; - uint32_t size_c_slice = m * n; + size_t size_a_slice = m * k; + size_t size_b_slice = k * n; + size_t size_c_slice = m * n; auto A = alloc_host_and_copy( A_device, batch_size * size_a_slice, queue); diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp index db9c620a..499345fc 100644 --- a/tests/utils/execution.hpp +++ b/tests/utils/execution.hpp @@ -17,20 +17,23 @@ #pragma once #include "profiling.hpp" +#include "xetla.hpp" using namespace cl::sycl; using namespace gpu; using namespace gpu::xetla; +enum class test_result { complete = 0, skip = 1, fail = 2 }; + template class 
initialize_func, template class validate_func, template class KERNEL, int SLMSIZE = 128 * 1024, int BARNUM = 32> void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, - std::string compile_str) { + std::string compile_str, size_t batch = 1) { + test_result result = test_result::complete; constexpr size_t wg_tile_m = Test::wg_m; constexpr size_t wg_tile_n = Test::wg_n; @@ -38,32 +41,33 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, constexpr size_t sg_tile_n = Test::sg_n; constexpr size_t sg_tile_k = Test::sg_k; - int size_a = matrix_m * matrix_k; - int size_b = matrix_k * matrix_n; - int size_c = matrix_m * matrix_n; + size_t size_a = matrix_m * matrix_k; + size_t size_b = matrix_k * matrix_n; + size_t size_c = matrix_m * matrix_n; sycl::property_list properties {sycl::property::queue::enable_profiling()}; auto queue = sycl::queue(properties); auto context = queue.get_info(); auto device = queue.get_info(); - std::cout << "Running on " << device.get_info() << "\n"; + std::cout << "Running on batch: " << batch << ", " + << device.get_info() << "\n"; auto A = alloc_device_and_init( - size_a, + batch * size_a, [](data_type_a *data, size_t idx) { - data[idx] = static_cast(random_float()); + data[idx] = static_cast((idx * 3) % 17); }, queue, device, context); auto B = alloc_device_and_init( - size_b, + batch * size_b, [](data_type_b *data, size_t idx) { - data[idx] = static_cast(random_float()); + data[idx] = static_cast((idx * 5) % 19); }, queue, device, context); auto C = alloc_device_and_init( - size_c, + batch * size_c, [](data_type_c *data, size_t idx) { - data[idx] = static_cast(0.0f); + data[idx] = static_cast(0); }, queue, device, context); @@ -92,33 +96,88 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, setenv("SYCL_PROGRAM_COMPILE_OPTIONS", compile_str.c_str(), 1); kernel_bundle exeBundle = build(inputBundle); unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS"); - auto e_esimd = queue.submit([&](handler &cgh) { - cgh.use_kernel_bundle(exeBundle); - cgh.parallel_for( - nd_range, [=](nd_item<3> item) SYCL_ESIMD_KERNEL { - gpu::xetla::xetla_exec_item<3> ei(item); - gpu::xetla::xetla_local_init(); - gpu::xetla::xetla_nbarrier_init(); - KERNEL::run(ei, A, B, C, matrix_m, - matrix_n, matrix_k); - }); - }); - e_esimd.wait(); + using namespace gpu::xetla::group; + using namespace gpu::xetla::kernel; + using namespace gpu::xetla::subgroup; + using tile_shape + = tile_shape_t; + static constexpr uint32_t periodic_sync_interval = 8; + static constexpr uint32_t prefetch_distance = 3; + using brgemm_t = typename brgemm_selector_t::brgemm; + + using update_method = typename std::conditional<(Test::l3_kslicing > 1), + result_reduce_sum, result_overwrite>::type; + using epilogue_t = epilogue_t< + epilogue_policy_default, + tile_shape, + mem_desc_t>; + + using gemm_op_t = gemm_t, + brgemm_t, epilogue_t>; + + for (size_t i = 0; i < batch; i++) { + auto A_ptr = A + i * size_a; + auto B_ptr = B + i * size_b; + auto C_ptr = C + i * size_c; + typename gemm_op_t::arguments_t arg(matrix_m, matrix_k, matrix_n, + A_ptr, + Test::layout_a == mem_layout::col_major ? matrix_m + : matrix_k, + B_ptr, + Test::layout_b == mem_layout::col_major ? matrix_k + : matrix_n, + C_ptr, matrix_n); + if (!gemm_op_t::can_implement(arg)) { + std::cout << "The arguments cannot be supported, skip ... 
" + << std::endl; + result = test_result::skip; + break; + } + + auto e_esimd = queue.submit([&](handler &cgh) { + cgh.use_kernel_bundle(exeBundle); + cgh.parallel_for( + nd_range, [=](nd_item<3> item) SYCL_ESIMD_KERNEL { + gpu::xetla::xetla_exec_item<3> ei(item); + gpu::xetla::xetla_local_init(); + gpu::xetla::xetla_nbarrier_init(); + KERNEL::run(ei, A_ptr, B_ptr, C_ptr, + matrix_m, matrix_n, matrix_k); + }); + }); + e_esimd.wait(); + } } catch (cl::sycl::exception const &e) { std::cout << "SYCL exception caught: " << e.what() << '\n'; - FAIL(); + result = test_result::fail; } // validation - validate_func - vfunc; - ASSERT_EQ(0, vfunc(A, B, C, queue, context)); + if (result == test_result::complete) { + validate_func + vfunc; + ASSERT_EQ(0, vfunc(A, B, C, queue, context)); + } free(A, context); free(B, context); free(C, context); + + if (result == test_result::skip) { + GTEST_SKIP(); + } else if (result != test_result::complete) { + FAIL(); + } } /// @brief The template function to execute kernel in esimd way for unit test framework @@ -128,8 +187,8 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, /// @param nd_range the range of workitems /// @param validate_result validation function, taking 3 parameters buffer A, B as input C as output /// -template +template void kernel_run(auto nd_range, auto validate_result) { queue queue {}; diff --git a/tools/scripts/env.sh b/tools/scripts/env.sh index 357f58e3..36c5cf1e 100755 --- a/tools/scripts/env.sh +++ b/tools/scripts/env.sh @@ -10,4 +10,5 @@ source ${ONEAPI_INSTALL_PATH}/setvars.sh # Export environment variables export CC=icx +export CXX=icpx export ONEAPI_DEVICE_SELECTOR=level_zero1:*