This repository has been archived by the owner on Dec 18, 2024. It is now read-only.

Add test cases and refine them (#563)
taozha2 committed Jun 16, 2023
1 parent 76c593f commit 1b319d5
Showing 49 changed files with 373 additions and 370 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@
 ## [v0.3.2](https://github.com/intel/xetla/releases/tag/v0.3.2) (2023-06-16)
 - New Features
 * Added some kernel-level API parameter-check functions; users must call them explicitly before launching the kernel. They return failure and print an error message when an unsupported scenario is detected; launching the kernel anyway in an unsupported scenario may lead to unpredictable results.
+* Removed the reduce_sum + tile_op epilogue policy.
 * Added some unit test cases.
 * Refined some example code.
 * Updated documents, added some diagrams and details.
5 changes: 4 additions & 1 deletion README.md
@@ -1,4 +1,7 @@
-# Intel® Xe Templates for Linear Algebra [v0.3.2](/CHANGELOG.md)
+# Intel® Xe Templates for Linear Algebra
+
+_Intel® XeTLA [v0.3.2](/CHANGELOG.md) - June 2023_
+
 Intel® Xe Templates for Linear Algebra (Intel® XeTLA) is a collection of SYCL/ESIMD templates that enable high-performance General Matrix Multiply (GEMM), Convolution (CONV), and related computations on Intel Xe GPU architecture. Intel® XeTLA offers reusable C++ templates for kernel, work-group and sub-group levels, allowing developers to optimize and specialize kernels based on data types, tiling policies, algorithms, fusion policies, and more.
 
 One of the key features of Intel® XeTLA is its ability to abstract and hide details of Xe hardware implementations, particularly those related to matrix computations, such as the systolic array and other low level instructions. This ensures that SYCL/DPC++ developers can focus on leveraging the performance benefits of Intel® XeTLA without being burdened by hardware-specific instructions.
4 changes: 3 additions & 1 deletion examples/01_basic_gemm/basic_gemm.cpp
@@ -99,7 +99,9 @@ void basic_gemm_run(uint32_t iter) {
             ::brgemm;
 
     using epilogue_t = xetla::group::epilogue_t<
-            xetla::group::epilogue_policy_default<gpu_arch::Xe>, tile_shape,
+            xetla::group::epilogue_policy_default<result_overwrite,
+                    gpu_arch::Xe>,
+            tile_shape,
             mem_desc_t<data_type_c, mem_layout::row_major, mem_space::global>>;
 
     using gemm_op_t = xetla::kernel::gemm_t<
6 changes: 3 additions & 3 deletions examples/02_basic_brgemm/basic_brgemm.cpp
@@ -145,9 +145,9 @@ void basic_brgemm_run(uint32_t iter) {
     brgemm_t brgemm;
 
     // Step 4: epilogue function to overwrite the result
-    using epilogue_t
-            = epilogue_t<epilogue_policy_default<gpu_arch::Xe>,
-                    tile_shape, mem_desc_output_c>;
+    using epilogue_t = epilogue_t<
+            epilogue_policy_default<result_overwrite, gpu_arch::Xe>,
+            tile_shape, mem_desc_output_c>;
 
     // Step 5: define the shared local memory usages
     // developers have the responsibility to set
5 changes: 2 additions & 3 deletions examples/03_gemm_relu_bias/gemm_relu_bias.cpp
@@ -25,7 +25,7 @@ template <typename data_type_a, typename data_type_b, typename data_type_c,
         typename data_type_d, typename data_type_acc = float>
 int gemm_relu_bias_result_validate(data_type_a *A_device, data_type_b *B_device,
         data_type_c *C_device, data_type_d *D_device, uint32_t m, uint32_t k,
-        uint32_t n, sycl::queue queue,
+        uint32_t n, sycl::queue &queue,
         mem_layout mem_layout_a_ = mem_layout::row_major,
         mem_layout mem_layout_b_ = mem_layout::row_major) {
     auto A = alloc_host_and_copy<data_type_a>(A_device, m * k, queue);
@@ -167,8 +167,7 @@ void gemm_relu_bias_run(uint32_t iter) {
     using mem_desc_output_t
             = mem_desc_t<data_type_c, mem_layout::row_major, mem_space::global>;
     using epilogue_t = xetla::group::epilogue_t<
-            xetla::group::epilogue_policy_tile_op<tile_op_t, result_overwrite,
-                    gpu_arch::Xe>,
+            xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>,
             tile_shape, mem_desc_output_t>;
 
     using gemm_op_t = xetla::kernel::gemm_t<
9 changes: 4 additions & 5 deletions examples/04_brgemm_polynomial/brgemm_polynomial.cpp
@@ -26,7 +26,7 @@ template <typename data_type_a, typename data_type_b, typename data_type_c,
         typename data_type_acc = float>
 int gemm_polynomial_result_validate(data_type_a *A_device,
         data_type_b *B_device, data_type_c *C_device, int m, int k, int n,
-        sycl::queue queue, mem_layout mem_layout_a_ = mem_layout::row_major,
+        sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major,
         mem_layout mem_layout_b_ = mem_layout::row_major) {
     auto A = alloc_host_and_copy<data_type_a>(A_device, m * k, queue);
     auto B = alloc_host_and_copy<data_type_b>(B_device, k * n, queue);
@@ -203,10 +203,9 @@ void brgemm_polynomial_run(int iter) {
     // is already calculated.
     // Mathematically epilogue_t is a map that applies to each element:
     // epilogue_t: [m, n] -> [m, n], C_acc |-> tile_op_t(C_acc)
-    using epilogue_t
-            = epilogue_t<epilogue_policy_tile_op<tile_op_t,
-                                 result_overwrite, gpu_arch::Xe>,
-                    tile_shape, mem_desc_output_c>;
+    using epilogue_t = epilogue_t<
+            epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>,
+            tile_shape, mem_desc_output_c>;
 
     // [Polynomial] define arguments for each epilogue_tile_op in chained_tile_op_t<>
     using epilogue_tile_op_args_t
4 changes: 3 additions & 1 deletion examples/05_batch_gemm/batch_gemm.cpp
@@ -111,7 +111,9 @@ void batch_gemm_run(uint32_t iter) {
             ::brgemm;
 
     using epilogue_t = xetla::group::epilogue_t<
-            xetla::group::epilogue_policy_default<gpu_arch::Xe>, tile_shape,
+            xetla::group::epilogue_policy_default<result_overwrite,
+                    gpu_arch::Xe>,
+            tile_shape,
             mem_desc_t<data_type_c, mem_layout::row_major, mem_space::global>>;
 
     using gemm_op_t = xetla::kernel::gemm_t<
8 changes: 4 additions & 4 deletions examples/06_splitk_brgemm/splitk_brgemm.cpp
@@ -157,10 +157,10 @@ void splitk_brgemm_run(uint32_t iter) {
     // [Split-K] When Split-K is used, update_method should be set to
     // result_reduce_sum in order to aggregate partial sum from each sub-task
     // to the final output matrix C
-    using epilogue_t = epilogue_t<
-            epilogue_policy_tile_op<chained_tile_op_t<>,
-                    result_reduce_sum, gpu_arch::Xe>,
-            tile_shape, mem_desc_output_c>;
+    using epilogue_t
+            = epilogue_t<epilogue_policy_default<result_reduce_sum,
+                                 gpu_arch::Xe>,
+                    tile_shape, mem_desc_output_c>;
 
     // Step 5: define the shared local memory usages
     // developers have the responsibility to set
5 changes: 3 additions & 2 deletions examples/07_gemm_softmax/gemm_softmax.cpp
@@ -26,7 +26,7 @@ template <typename data_type_a, typename data_type_b, typename data_type_c,
         typename data_type_acc = float>
 int gemm_softmax_result_validate(data_type_a *A_device, data_type_b *B_device,
         data_type_c *C_device, uint32_t m, uint32_t k, uint32_t n,
-        uint32_t batch_num, sycl::queue queue,
+        uint32_t batch_num, sycl::queue &queue,
         mem_layout mem_layout_a_ = mem_layout::row_major,
         mem_layout mem_layout_b_ = mem_layout::row_major) {
     uint32_t err_cnt = 0;
@@ -213,7 +213,8 @@ void gemm_softmax_run(uint32_t iter) {
 
     // epilogue function to overwrite the result
     using epilogue_t = epilogue_t<
-            epilogue_policy_default<gpu_arch::Xe>,
+            epilogue_policy_default<result_overwrite,
+                    gpu_arch::Xe>,
             tile_shape, mem_desc_c_t>;
 
     // using experimental::group::softmax
9 changes: 4 additions & 5 deletions examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp
@@ -37,7 +37,7 @@ template <typename data_type_a, typename data_type_b, typename data_type_c,
         typename data_type_acc = float>
 int mlp_result_validate(data_type_a *A_device, data_type_b *B_device,
         data_type_c *C_device, data_type_w *W_device, data_type_v *V_device,
-        uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue queue,
+        uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue &queue,
         mem_layout mem_layout_a_ = mem_layout::row_major,
         mem_layout mem_layout_w_ = mem_layout::row_major,
         mem_layout mem_layout_v_ = mem_layout::row_major) {
@@ -248,8 +248,7 @@ void mlp_run(uint32_t iter) {
             mem_layout::row_major, mem_space::global>;
     using epilogue_t = epilogue_t<
             epilogue_policy_tile_op<
-                    chained_tile_op_t<relu_op_t>,
-                    result_overwrite, gpu_arch::Xe>,
+                    chained_tile_op_t<relu_op_t>, gpu_arch::Xe>,
             tile_shape, mem_desc_output_t>;
     // [MLP] Define tile_op arguments
     using epilogue_tile_op_args_t
@@ -351,8 +350,8 @@ void mlp_run(uint32_t iter) {
             ::brgemm;
 
     using epilogue_t = epilogue_t<
-            epilogue_policy_tile_op<chained_tile_op_t<>,
-                    result_overwrite, gpu_arch::Xe>,
+            epilogue_policy_default<result_overwrite,
+                    gpu_arch::Xe>,
             tile_shape,
             mem_desc_t<data_type_c, mem_layout::row_major,
                     mem_space::global>>;
@@ -31,7 +31,7 @@ template <typename dtype_in, typename dtype_out, typename data_type_acc = float>
 int sdp_fwd_result_validate(dtype_in *q_device, dtype_in *k_device,
         dtype_in *v_device, dtype_in *mask_device, dtype_out *c_device,
         uint32_t qk_m, uint32_t qk_k, uint32_t qk_n, uint32_t sv_m,
-        uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue queue,
+        uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue &queue,
         mem_layout mem_layout_qk_a_ = mem_layout::row_major,
         mem_layout mem_layout_qk_b_ = mem_layout::row_major,
         mem_layout mem_layout_sv_a_ = mem_layout::row_major,
@@ -267,8 +267,7 @@ void sdp_fwd_run(uint32_t iter) {
             sg_tile_k_qk, mma_engine::xmx, gpu_arch::Xe,
             prefetch_distance, periodic_sync_interval>::brgemm;
     using epilogue0_t = epilogue_t<
-            epilogue_policy_tile_op<post_op_t, result_overwrite,
-                    gpu_arch::Xe>,
+            epilogue_policy_tile_op<post_op_t, gpu_arch::Xe>,
             tile_shape0,
             mem_desc_t<dtype_sfx, mem_layout::row_major,
                     mem_space::local>>;
7 changes: 5 additions & 2 deletions examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp
@@ -24,7 +24,7 @@ int validation(data_type *layer_inputs_device, data_type *h0_inputs_device,
         std::vector<data_type *> i_weights, std::vector<data_type *> h_weights,
         data_type *hidden_outputs_device, data_type *layer_outputs_device,
         uint32_t batch_size, uint32_t input_size, uint32_t hidden_size,
-        uint32_t sequence_length, sycl::queue queue, uint32_t layer_size = 1) {
+        uint32_t sequence_length, sycl::queue &queue, uint32_t layer_size = 1) {
     uint32_t layer_input_size = batch_size * input_size;
     uint32_t hidden_io_size = batch_size * hidden_size;
     uint32_t i_weight_size = input_size * hidden_size;
@@ -268,6 +268,9 @@ void gru_run(uint32_t iter) {
                 data[idx] = static_cast<data_type>(0.001 * random_float());
             },
             queue, device, context);
+
+    i_weights.push_back(ir_weights);
+
     auto iz_weights = alloc_device_and_init<data_type>(
             input_weight_size,
             [](data_type *data, size_t idx) {
@@ -280,7 +283,7 @@ void gru_run(uint32_t iter) {
     auto in_weights = alloc_device_and_init<data_type>(
             input_weight_size,
             [](data_type *data, size_t idx) {
-                data[idx] = static_cast<data_type>(0.001 * random_float());
+                data[idx] = static_cast<data_type>(0.0001 * random_float());
             },
             queue, device, context);
 
5 changes: 3 additions & 2 deletions examples/10_gate_recurrent_unit/kernel_func.hpp
@@ -128,8 +128,9 @@ struct gru_layer {
 
     // define arguments for each epilogue_tile_op in chained_tile_op_t<>
 
-    using epilogue_t = epilogue_t<epilogue_policy_default<gpu_arch::Xe>,
-            tile_shape, mem_desc_c_t>;
+    using epilogue_t = epilogue_t<
+            epilogue_policy_default<result_overwrite, gpu_arch::Xe>, tile_shape,
+            mem_desc_c_t>;
     using epilogue_args_t = typename epilogue_t::arguments_t;
 
     using matC_tile_desc_t = tile_desc_t<matAcc_t::tile_size_x,
24 changes: 10 additions & 14 deletions include/group/epilogue/epilogue_policy.hpp
@@ -27,42 +27,38 @@ namespace gpu::xetla::group {
 /// @{
 
 /// @brief Default epilogue policy for store C.
+/// @tparam update_method_ Is the store method of matC.
 /// @tparam arch_ Is the HW architecture.
-template <gpu_arch arch_ = gpu_arch::Xe>
+template <typename update_method_ = result_overwrite,
+        gpu_arch arch_ = gpu_arch::Xe>
 struct epilogue_policy_default {
+    using update_method = update_method_;
     static constexpr gpu_arch arch_tag = arch_;
+    static_assert(std::is_same<update_method, result_overwrite>::value
+                    || std::is_same<update_method, result_reduce_sum>::value,
+            "The result can be either overwrite or reduce_sum");
 };
 
 /// @brief Epilogue policy for tile_op + store C fusion.
 /// @tparam tile_op_t_ Is the tile_op functor.
-/// @tparam update_method_ Is the store method of matC.
 /// @tparam arch_ Is the HW architecture.
-template <typename tile_op_t_, typename update_method_ = result_overwrite,
-        gpu_arch arch_ = gpu_arch::Xe>
+template <typename tile_op_t_, gpu_arch arch_ = gpu_arch::Xe>
 struct epilogue_policy_tile_op {
     using tile_op = tile_op_t_;
-    using update_method = update_method_;
-    static_assert(std::is_same<update_method, result_overwrite>::value
-                    || std::is_same<update_method, result_reduce_sum>::value,
-            "The result can be either overwrite or reduce_sum");
+    using update_method = result_overwrite;
     static constexpr gpu_arch arch_tag = arch_;
 };
 
 /// @brief Epilogue functor, specialized for quantization operator.
 /// @tparam tile_op_t_ is the tile op type.
 /// @tparam quant_op_t_ is the quantization op type
-/// @tparam update_method_ is update policy for result store.
 /// @tparam arch_ Is the HW architecture.
 template <typename tile_op_t_, typename quant_op_t_,
-        typename update_method_ = result_overwrite,
         gpu_arch arch_ = gpu_arch::Xe>
 struct epilogue_policy_quant_op {
     using tile_op = tile_op_t_;
     using quant_op = quant_op_t_;
-    using update_method = update_method_;
-    static_assert(std::is_same<update_method, result_overwrite>::value
-                    || std::is_same<update_method, result_reduce_sum>::value,
-            "The result can be either overwrite or reduce_sum");
+    using update_method = result_overwrite;
     static constexpr gpu_arch arch_tag = arch_;
 };
 /// @} xetla_epilogue
21 changes: 14 additions & 7 deletions include/group/epilogue/impl/default_xe.hpp
@@ -29,12 +29,13 @@ namespace gpu::xetla::group {
 /// @{
 
 /// @brief Is the epilogue functor specialized for epilogue_policy_default and Xe architecture.
-template <typename tile_shape_, typename mem_desc_c_t_>
-class epilogue_t<epilogue_policy_default<gpu_arch::Xe>, tile_shape_,
-        mem_desc_c_t_> {
+template <typename tile_shape_, typename update_method_, typename mem_desc_c_t_>
+class epilogue_t<epilogue_policy_default<update_method_, gpu_arch::Xe>,
+        tile_shape_, mem_desc_c_t_> {
 public:
-    using epilogue_policy = epilogue_policy_default<gpu_arch::Xe>;
-    using update_method = result_overwrite;
+    using epilogue_policy
+            = epilogue_policy_default<update_method_, gpu_arch::Xe>;
+    using update_method = typename epilogue_policy::update_method;
     using tile_shape = tile_shape_;
     using mem_desc_c_t = mem_desc_c_t_;
     static constexpr gpu_arch arch_tag = gpu_arch::Xe;
@@ -55,8 +56,14 @@ class epilogue_t<epilogue_policy_default<gpu_arch::Xe>, tile_shape_,
     static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout;
     static constexpr mem_space mem_space_c = mem_desc_c_t::space;
     static constexpr msg_type msg_type_c
-            = (mem_space_c == mem_space::global ? msg_type::block_2d
-                                                : msg_type::scatter);
+            = std::is_same<update_method, result_overwrite>::value
+            ? (mem_space_c == mem_space::global ? msg_type::block_2d
+                                                : msg_type::scatter)
+            : msg_type::atomic_add;
+    static_assert(!(std::is_same<update_method, result_reduce_sum>::value
+                          && mem_space_c == mem_space::local),
+            "Local memory not support result_reduce_sum");
+
     /// @brief Updates tile base descriptor based on the tid.
     __XETLA_API static void update_sg_tile_tdesc(
             work_group_t &g, mem_desc_c_t &mem_desc_c) {
24 changes: 9 additions & 15 deletions include/group/epilogue/impl/quant_tile_op_xe.hpp
@@ -29,17 +29,17 @@ namespace gpu::xetla::group {
 /// @{
 
 /// @brief Is the epilogue functor specialized for epilogue_policy_quant_op and Xe architecture.
-template <typename tile_op_t_, typename quant_op_t_, typename update_method_,
-        typename tile_shape_, typename mem_desc_c_t_>
-class epilogue_t<epilogue_policy_quant_op<tile_op_t_, quant_op_t_,
-                         update_method_, gpu_arch::Xe>,
+template <typename tile_op_t_, typename quant_op_t_, typename tile_shape_,
+        typename mem_desc_c_t_>
+class epilogue_t<
+        epilogue_policy_quant_op<tile_op_t_, quant_op_t_, gpu_arch::Xe>,
         tile_shape_, mem_desc_c_t_> {
 public:
-    using epilogue_policy = epilogue_policy_quant_op<tile_op_t_, quant_op_t_,
-            update_method_, gpu_arch::Xe>;
+    using epilogue_policy
+            = epilogue_policy_quant_op<tile_op_t_, quant_op_t_, gpu_arch::Xe>;
     using quant_op_t = quant_op_t_;
     using tile_op_t = tile_op_t_;
-    using update_method = update_method_;
+    using update_method = typename epilogue_policy::update_method;
     using tile_shape = tile_shape_;
     using mem_desc_c_t = mem_desc_c_t_;
     static constexpr gpu_arch arch_tag = gpu_arch::Xe;
@@ -100,14 +100,8 @@ class epilogue_t<epilogue_policy_quant_op<tile_op_t_, quant_op_t_,
     static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout;
     static constexpr mem_space mem_space_c = mem_desc_c_t::space;
     static constexpr msg_type msg_type_c
-            = std::is_same<update_method, result_overwrite>::value
-            ? (mem_space_c == mem_space::global ? msg_type::block_2d
-                                                : msg_type::scatter)
-            : msg_type::atomic_add;
-
-    static_assert(!(std::is_same<update_method, result_reduce_sum>::value
-                          && mem_space_c == mem_space::local),
-            "Local memory not support result_reduce_sum");
+            = (mem_space_c == mem_space::global ? msg_type::block_2d
+                                                : msg_type::scatter);
 
     /// @brief Updates tile base descriptor based on the tid.
     __XETLA_API static void update_sg_tile_tdesc(