From 1b319d5beb4c55805ab92a2e4e997a622287d6ab Mon Sep 17 00:00:00 2001 From: taozha2 Date: Tue, 13 Jun 2023 18:43:06 +0800 Subject: [PATCH] Add test cases and refine them (#563) --- CHANGELOG.md | 1 + README.md | 5 +- examples/01_basic_gemm/basic_gemm.cpp | 4 +- examples/02_basic_brgemm/basic_brgemm.cpp | 6 +- examples/03_gemm_relu_bias/gemm_relu_bias.cpp | 5 +- .../brgemm_polynomial.cpp | 9 +- examples/05_batch_gemm/batch_gemm.cpp | 4 +- examples/06_splitk_brgemm/splitk_brgemm.cpp | 8 +- examples/07_gemm_softmax/gemm_softmax.cpp | 5 +- .../multi_layer_perceptron.cpp | 9 +- .../scaled_dot_production_attention.cpp | 5 +- .../gate_recurrent_unit.cpp | 7 +- .../10_gate_recurrent_unit/kernel_func.hpp | 5 +- include/group/epilogue/epilogue_policy.hpp | 24 ++-- include/group/epilogue/impl/default_xe.hpp | 21 ++- .../group/epilogue/impl/quant_tile_op_xe.hpp | 24 ++-- include/group/epilogue/impl/tile_op_xe.hpp | 23 +--- include/subgroup/tile/config.hpp | 42 ------ media/docs/programming_guidelines.md | 14 +- media/docs/quick_start.md | 2 +- media/docs/terminology.md | 19 --- tests/integration/data_transformer/common.hpp | 2 +- tests/integration/gemm/bf16/common.hpp | 21 +-- tests/integration/gemm/bf16/kernel_func.hpp | 2 +- tests/integration/gemm/bf16/main.cpp | 6 +- tests/integration/gemm/fp16/kernel_func.hpp | 2 +- tests/integration/gemm/fp16/main.cpp | 27 +--- tests/integration/gemm/fp32/common.hpp | 78 ++++++++--- tests/integration/gemm/fp32/kernel_func.hpp | 8 +- tests/integration/gemm/fp32/main.cpp | 8 +- tests/integration/gemm/int8/common.hpp | 80 ++++++++---- tests/integration/gemm/int8/kernel_func.hpp | 2 +- tests/integration/gemm/int8/main.cpp | 6 +- .../gemm/int8_quantization/common.hpp | 2 +- .../gemm/int8_quantization/kernel_func.hpp | 2 +- tests/integration/gemm/tf32/kernel_func.hpp | 2 +- tests/integration/gemm/tf32/main.cpp | 26 +--- tests/integration/row_reduction/common.hpp | 2 +- tests/integration/sg_dropout_op/common.hpp | 2 +- .../integration/sg_dropout_op/kernel_func.hpp | 7 +- .../integration/vector_add/bf16_2d/common.hpp | 3 +- .../vector_add/int32_1d/common.hpp | 2 +- .../vector_add/int32_2d/common.hpp | 2 +- .../integration/vector_add/tf32_1d/common.hpp | 2 +- tests/unit/epilogue_tile_op/kernel_func.hpp | 11 +- tests/utils/buff_compare.hpp | 64 +++++---- tests/utils/common.hpp | 10 +- tests/utils/execution.hpp | 121 +++++++++++++----- tools/scripts/env.sh | 1 + 49 files changed, 373 insertions(+), 370 deletions(-) delete mode 100644 include/subgroup/tile/config.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e5037e7..44ddf787 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ## [v0.3.2](https://github.com/intel/xetla/releases/tag/v0.3.2) (2023-06-16) - New Features * Added some kernel-level APIs' parameters check functions, users need to explicit call them before launch the kernel; will return fail and print error messages when detect unsupported scenarios, continue launching the kernel for unspported scenarios may lead to unpredictable result. + * Removed reduce_sum + tile_op epilogue policy. * Added some unit test cases. * Refined some examples code. * Updated documents, added some diagrams and details. diff --git a/README.md b/README.md index 87929fb7..d0452e9c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# Intel® Xe Templates for Linear Algebra [v0.3.2](/CHANGELOG.md) +# Intel® Xe Templates for Linear Algebra + +_Intel® XeTLA [v0.3.2](/CHANGELOG.md) - June 2023_ + Intel® Xe Templates for Linear Algebra (Intel® XeTLA) is a collection of SYCL/ESIMD templates that enable high-performance General Matrix Multiply (GEMM), Convolution (CONV), and related computations on Intel Xe GPU architecture. Intel® XeTLA offers reusable C++ templates for kernel, work-group and sub-group levels, allowing developers to optimize and specialize kernels based on data types, tiling policies, algorithms, fusion policies, and more. One of the key features of Intel® XeTLA is its ability to abstract and hide details of Xe hardware implementations, particularly those related to matrix computations, such as the systolic array and other low level instructions. This ensures that SYCL/DPC++ developers can focus on leveraging the performance benefits of Intel® XeTLA without being burdened by hardware-specific instructions. diff --git a/examples/01_basic_gemm/basic_gemm.cpp b/examples/01_basic_gemm/basic_gemm.cpp index 80f9f3ac..dcb3cf3e 100644 --- a/examples/01_basic_gemm/basic_gemm.cpp +++ b/examples/01_basic_gemm/basic_gemm.cpp @@ -99,7 +99,9 @@ void basic_gemm_run(uint32_t iter) { ::brgemm; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_default, tile_shape, + xetla::group::epilogue_policy_default, + tile_shape, mem_desc_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/02_basic_brgemm/basic_brgemm.cpp b/examples/02_basic_brgemm/basic_brgemm.cpp index 363f5c22..a0fb35c9 100644 --- a/examples/02_basic_brgemm/basic_brgemm.cpp +++ b/examples/02_basic_brgemm/basic_brgemm.cpp @@ -145,9 +145,9 @@ void basic_brgemm_run(uint32_t iter) { brgemm_t brgemm; // Step 4: epilogue function to overwrite the result - using epilogue_t - = epilogue_t, - tile_shape, mem_desc_output_c>; + using epilogue_t = epilogue_t< + epilogue_policy_default, + tile_shape, mem_desc_output_c>; // Step 5: define the shared local memory usages // developers have the responsibility to set diff --git a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp index cec6d644..630cc0da 100644 --- a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp +++ b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp @@ -25,7 +25,7 @@ template int gemm_relu_bias_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_d *D_device, uint32_t m, uint32_t k, - uint32_t n, sycl::queue queue, + uint32_t n, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { auto A = alloc_host_and_copy(A_device, m * k, queue); @@ -167,8 +167,7 @@ void gemm_relu_bias_run(uint32_t iter) { using mem_desc_output_t = mem_desc_t; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_tile_op, + xetla::group::epilogue_policy_tile_op, tile_shape, mem_desc_output_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/04_brgemm_polynomial/brgemm_polynomial.cpp b/examples/04_brgemm_polynomial/brgemm_polynomial.cpp index 2c03c6e8..ecce1a07 100644 --- a/examples/04_brgemm_polynomial/brgemm_polynomial.cpp +++ b/examples/04_brgemm_polynomial/brgemm_polynomial.cpp @@ -26,7 +26,7 @@ template int gemm_polynomial_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, int m, int k, int n, - sycl::queue queue, mem_layout mem_layout_a_ = mem_layout::row_major, + sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { auto A = alloc_host_and_copy(A_device, m * k, queue); auto B = alloc_host_and_copy(B_device, k * n, queue); @@ -203,10 +203,9 @@ void brgemm_polynomial_run(int iter) { // is already calculated. // Mathematically epilogue_t is a map that applies to each element: // epilogue_t: [m, n] -> [m, n], C_acc |-> tile_op_t(C_acc) - using epilogue_t - = epilogue_t, - tile_shape, mem_desc_output_c>; + using epilogue_t = epilogue_t< + epilogue_policy_tile_op, + tile_shape, mem_desc_output_c>; // [Polynomial] define arguments for each epilogue_tile_op in chained_tile_op_t<> using epilogue_tile_op_args_t diff --git a/examples/05_batch_gemm/batch_gemm.cpp b/examples/05_batch_gemm/batch_gemm.cpp index b9c72cac..39c2d832 100644 --- a/examples/05_batch_gemm/batch_gemm.cpp +++ b/examples/05_batch_gemm/batch_gemm.cpp @@ -111,7 +111,9 @@ void batch_gemm_run(uint32_t iter) { ::brgemm; using epilogue_t = xetla::group::epilogue_t< - xetla::group::epilogue_policy_default, tile_shape, + xetla::group::epilogue_policy_default, + tile_shape, mem_desc_t>; using gemm_op_t = xetla::kernel::gemm_t< diff --git a/examples/06_splitk_brgemm/splitk_brgemm.cpp b/examples/06_splitk_brgemm/splitk_brgemm.cpp index 595b65a5..60bd84c2 100644 --- a/examples/06_splitk_brgemm/splitk_brgemm.cpp +++ b/examples/06_splitk_brgemm/splitk_brgemm.cpp @@ -157,10 +157,10 @@ void splitk_brgemm_run(uint32_t iter) { // [Split-K] When Split-K is used, update_method should be set to // result_reduce_sum in order to aggregate partial sum from each sub-task // to the final output matrix C - using epilogue_t = epilogue_t< - epilogue_policy_tile_op, - result_reduce_sum, gpu_arch::Xe>, - tile_shape, mem_desc_output_c>; + using epilogue_t + = epilogue_t, + tile_shape, mem_desc_output_c>; // Step 5: define the shared local memory usages // developers have the responsibility to set diff --git a/examples/07_gemm_softmax/gemm_softmax.cpp b/examples/07_gemm_softmax/gemm_softmax.cpp index 36e3cfc6..89bc2e00 100644 --- a/examples/07_gemm_softmax/gemm_softmax.cpp +++ b/examples/07_gemm_softmax/gemm_softmax.cpp @@ -26,7 +26,7 @@ template int gemm_softmax_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, uint32_t m, uint32_t k, uint32_t n, - uint32_t batch_num, sycl::queue queue, + uint32_t batch_num, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { uint32_t err_cnt = 0; @@ -213,7 +213,8 @@ void gemm_softmax_run(uint32_t iter) { // epilogue function to overwrite the result using epilogue_t = epilogue_t< - epilogue_policy_default, + epilogue_policy_default, tile_shape, mem_desc_c_t>; // using experimental::group::softmax diff --git a/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp b/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp index b271f813..f98039f7 100644 --- a/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp +++ b/examples/08_multi_layer_perceptron/multi_layer_perceptron.cpp @@ -37,7 +37,7 @@ template int mlp_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_w *W_device, data_type_v *V_device, - uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue queue, + uint32_t m, uint32_t k, uint32_t n, uint32_t l, sycl::queue &queue, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_w_ = mem_layout::row_major, mem_layout mem_layout_v_ = mem_layout::row_major) { @@ -248,8 +248,7 @@ void mlp_run(uint32_t iter) { mem_layout::row_major, mem_space::global>; using epilogue_t = epilogue_t< epilogue_policy_tile_op< - chained_tile_op_t, - result_overwrite, gpu_arch::Xe>, + chained_tile_op_t, gpu_arch::Xe>, tile_shape, mem_desc_output_t>; // [MLP] Define tile_op arguments using epilogue_tile_op_args_t @@ -351,8 +350,8 @@ void mlp_run(uint32_t iter) { ::brgemm; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp b/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp index 9cc19c72..43cf0e56 100644 --- a/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp +++ b/examples/09_scaled_dot_production_attention/scaled_dot_production_attention.cpp @@ -31,7 +31,7 @@ template int sdp_fwd_result_validate(dtype_in *q_device, dtype_in *k_device, dtype_in *v_device, dtype_in *mask_device, dtype_out *c_device, uint32_t qk_m, uint32_t qk_k, uint32_t qk_n, uint32_t sv_m, - uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue queue, + uint32_t sv_k, uint32_t sv_n, uint32_t batch_cnt, sycl::queue &queue, mem_layout mem_layout_qk_a_ = mem_layout::row_major, mem_layout mem_layout_qk_b_ = mem_layout::row_major, mem_layout mem_layout_sv_a_ = mem_layout::row_major, @@ -267,8 +267,7 @@ void sdp_fwd_run(uint32_t iter) { sg_tile_k_qk, mma_engine::xmx, gpu_arch::Xe, prefetch_distance, periodic_sync_interval>::brgemm; using epilogue0_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_tile_op, tile_shape0, mem_desc_t>; diff --git a/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp b/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp index b1034568..05e65ce0 100644 --- a/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp +++ b/examples/10_gate_recurrent_unit/gate_recurrent_unit.cpp @@ -24,7 +24,7 @@ int validation(data_type *layer_inputs_device, data_type *h0_inputs_device, std::vector i_weights, std::vector h_weights, data_type *hidden_outputs_device, data_type *layer_outputs_device, uint32_t batch_size, uint32_t input_size, uint32_t hidden_size, - uint32_t sequence_length, sycl::queue queue, uint32_t layer_size = 1) { + uint32_t sequence_length, sycl::queue &queue, uint32_t layer_size = 1) { uint32_t layer_input_size = batch_size * input_size; uint32_t hidden_io_size = batch_size * hidden_size; uint32_t i_weight_size = input_size * hidden_size; @@ -268,6 +268,9 @@ void gru_run(uint32_t iter) { data[idx] = static_cast(0.001 * random_float()); }, queue, device, context); + + i_weights.push_back(ir_weights); + auto iz_weights = alloc_device_and_init( input_weight_size, [](data_type *data, size_t idx) { @@ -280,7 +283,7 @@ void gru_run(uint32_t iter) { auto in_weights = alloc_device_and_init( input_weight_size, [](data_type *data, size_t idx) { - data[idx] = static_cast(0.001 * random_float()); + data[idx] = static_cast(0.0001 * random_float()); }, queue, device, context); diff --git a/examples/10_gate_recurrent_unit/kernel_func.hpp b/examples/10_gate_recurrent_unit/kernel_func.hpp index bb2daf44..07331dac 100644 --- a/examples/10_gate_recurrent_unit/kernel_func.hpp +++ b/examples/10_gate_recurrent_unit/kernel_func.hpp @@ -128,8 +128,9 @@ struct gru_layer { // define arguments for each epilogue_tile_op in chained_tile_op_t<> - using epilogue_t = epilogue_t, - tile_shape, mem_desc_c_t>; + using epilogue_t = epilogue_t< + epilogue_policy_default, tile_shape, + mem_desc_c_t>; using epilogue_args_t = typename epilogue_t::arguments_t; using matC_tile_desc_t = tile_desc_t +template struct epilogue_policy_default { + using update_method = update_method_; static constexpr gpu_arch arch_tag = arch_; + static_assert(std::is_same::value + || std::is_same::value, + "The result can be either overwrite or reduce_sum"); }; /// @brief Epilogue policy for tile_op + store C fusion. /// @tparam tile_op_t_ Is the tile_op functor. -/// @tparam update_method_ Is the store method of matC. /// @tparam arch_ Is the HW architecture. -template +template struct epilogue_policy_tile_op { using tile_op = tile_op_t_; - using update_method = update_method_; - static_assert(std::is_same::value - || std::is_same::value, - "The result can be either overwrite or reduce_sum"); + using update_method = result_overwrite; static constexpr gpu_arch arch_tag = arch_; }; /// @brief Epilogue functor, specialized for quantization operator. /// @tparam tile_op_t_ is the tile op type. /// @tparam quant_op_t_ is the quantization op type -/// @tparam update_method_ is update policy for result store. /// @tparam arch_ Is the HW architecture. template struct epilogue_policy_quant_op { using tile_op = tile_op_t_; using quant_op = quant_op_t_; - using update_method = update_method_; - static_assert(std::is_same::value - || std::is_same::value, - "The result can be either overwrite or reduce_sum"); + using update_method = result_overwrite; static constexpr gpu_arch arch_tag = arch_; }; /// @} xetla_epilogue diff --git a/include/group/epilogue/impl/default_xe.hpp b/include/group/epilogue/impl/default_xe.hpp index f498c2f5..e4856606 100644 --- a/include/group/epilogue/impl/default_xe.hpp +++ b/include/group/epilogue/impl/default_xe.hpp @@ -29,12 +29,13 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_default and Xe architecture. -template -class epilogue_t, tile_shape_, - mem_desc_c_t_> { +template +class epilogue_t, + tile_shape_, mem_desc_c_t_> { public: - using epilogue_policy = epilogue_policy_default; - using update_method = result_overwrite; + using epilogue_policy + = epilogue_policy_default; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -55,8 +56,14 @@ class epilogue_t, tile_shape_, static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout; static constexpr mem_space mem_space_c = mem_desc_c_t::space; static constexpr msg_type msg_type_c - = (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter); + = std::is_same::value + ? (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter) + : msg_type::atomic_add; + static_assert(!(std::is_same::value + && mem_space_c == mem_space::local), + "Local memory not support result_reduce_sum"); + /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( work_group_t &g, mem_desc_c_t &mem_desc_c) { diff --git a/include/group/epilogue/impl/quant_tile_op_xe.hpp b/include/group/epilogue/impl/quant_tile_op_xe.hpp index 73f3a622..dadc8eb9 100644 --- a/include/group/epilogue/impl/quant_tile_op_xe.hpp +++ b/include/group/epilogue/impl/quant_tile_op_xe.hpp @@ -29,17 +29,17 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_quant_op and Xe architecture. -template -class epilogue_t, +template +class epilogue_t< + epilogue_policy_quant_op, tile_shape_, mem_desc_c_t_> { public: - using epilogue_policy = epilogue_policy_quant_op; + using epilogue_policy + = epilogue_policy_quant_op; using quant_op_t = quant_op_t_; using tile_op_t = tile_op_t_; - using update_method = update_method_; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -100,14 +100,8 @@ class epilogue_t::value - ? (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter) - : msg_type::atomic_add; - - static_assert(!(std::is_same::value - && mem_space_c == mem_space::local), - "Local memory not support result_reduce_sum"); + = (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter); /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( diff --git a/include/group/epilogue/impl/tile_op_xe.hpp b/include/group/epilogue/impl/tile_op_xe.hpp index 26372277..17fab24e 100644 --- a/include/group/epilogue/impl/tile_op_xe.hpp +++ b/include/group/epilogue/impl/tile_op_xe.hpp @@ -29,16 +29,13 @@ namespace gpu::xetla::group { /// @{ /// @brief Is the epilogue functor specialized for epilogue_policy_tile_op and Xe architecture. -template -class epilogue_t< - epilogue_policy_tile_op, - tile_shape_, mem_desc_c_t_> { +template +class epilogue_t, tile_shape_, + mem_desc_c_t_> { public: - using epilogue_policy - = epilogue_policy_tile_op; + using epilogue_policy = epilogue_policy_tile_op; using tile_op_t = tile_op_t_; - using update_method = update_method_; + using update_method = typename epilogue_policy::update_method; using tile_shape = tile_shape_; using mem_desc_c_t = mem_desc_c_t_; static constexpr gpu_arch arch_tag = gpu_arch::Xe; @@ -90,14 +87,8 @@ class epilogue_t< static constexpr mem_layout mem_layout_c = mem_desc_c_t::layout; static constexpr mem_space mem_space_c = mem_desc_c_t::space; static constexpr msg_type msg_type_c - = std::is_same::value - ? (mem_space_c == mem_space::global ? msg_type::block_2d - : msg_type::scatter) - : msg_type::atomic_add; - - static_assert(!(std::is_same::value - && mem_space_c == mem_space::local), - "Local memory not support result_reduce_sum"); + = (mem_space_c == mem_space::global ? msg_type::block_2d + : msg_type::scatter); /// @brief Updates tile base descriptor based on the tid. __XETLA_API static void update_sg_tile_tdesc( diff --git a/include/subgroup/tile/config.hpp b/include/subgroup/tile/config.hpp deleted file mode 100644 index 4f8bae5c..00000000 --- a/include/subgroup/tile/config.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2022-2023 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/// @file -/// C++ API - -#pragma once - -#include "subgroup/tile/common.hpp" - -namespace gpu::xetla::subgroup { - -/// @brief -/// -/// @tparam update_dir_ -/// @tparam omode_ -/// @tparam cyclic_size_ -template -struct mem_update_config_t { - static constexpr tdesc_update_dir update_dir = update_dir_; - static constexpr offset_mode omode = omode_; - static constexpr uint32_t cyclic_size = cyclic_size_; - // static_assert((omode != offset_mode::const_offset) || cyclic_size == 1, - // "for const_offset, the cyclic size should be 1"); -}; - -} // namespace gpu::xetla::subgroup diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md index 744ad966..30ecc97e 100644 --- a/media/docs/programming_guidelines.md +++ b/media/docs/programming_guidelines.md @@ -15,15 +15,11 @@ There are two groups of API to imeplement GEMM, brgemm (mirco-kernels) in group ## The Key Things for Better Performance Intel® XeTLA provides the basic building block of GEMM unit; however, it still needs to implement the kernel carefully for the better perforamnce in both algorithm and hardware level. -1. Hardware Compute Unit -In Intel's GPU, the compute unit is organized by sub-slices, and there are many Execution Unit (EU) and shared local memory inside. -The Intel® XeTLA's micro-kernel is designed to fully utilize the whole sub-slices to archieve the best performance. Thus, the software developers response to allocate at least the number of work-group equal with the number of sub-slices. - -3. Number of work-group / sub-group -4. K slicing algorithm -5. Reuse register for post operations -6. Data sharing through shared local memory -7. Reduction +1. Number of work-group / sub-group +2. K slicing algorithm +3. Reuse register for post operations +4. Data sharing through shared local memory +5. Reduction ## How To Implement A GEMM With Building Block diff --git a/media/docs/quick_start.md b/media/docs/quick_start.md index dc77737e..d390769b 100644 --- a/media/docs/quick_start.md +++ b/media/docs/quick_start.md @@ -2,7 +2,7 @@ # Quick Start - [Preparations](/media/docs/quick_start.md#preparations) -- [Setup Environment](/media/docs/quick_start.md#setup-build-environment) +- [Setup Environment](/media/docs/quick_start.md#setup-environment) - [Build](/media/docs/quick_start.md#build) - [Run Tests](/media/docs/quick_start.md#run-tests) - [Run Examples](/media/docs/quick_start.md#run-examples) diff --git a/media/docs/terminology.md b/media/docs/terminology.md index e6dcaeea..2037ac8d 100644 --- a/media/docs/terminology.md +++ b/media/docs/terminology.md @@ -75,25 +75,6 @@ Intel® XeTLA based on [SYCL](https://registry.khronos.org/SYCL/specs/sycl-2020/ **work-item**:The SYCL work-item is a representation of a work-item among a collection of parallel executions of a kernel invoked on a device by a command. A work-item is executed by one or more processing elements as part of a work-group executing on a compute unit. A work-item is distinguished from other work items by its global id or the combination of its work-group id and its local id within a work-group. -Because XeTLA is on the top of SYCL, the most of concept is inherabit from SYCL SPEC. -https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_work_group_data_parallel_kernels -work_item -work_group -sub_group -Ndrange -Shared local memory -Barrier - -XeTLA -kernel: -group: -subgroup: -tile: -compute policy: -compute_attr: -prefetch_distance: - - # Copyright Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at diff --git a/tests/integration/data_transformer/common.hpp b/tests/integration/data_transformer/common.hpp index c02dd100..05326aed 100644 --- a/tests/integration/data_transformer/common.hpp +++ b/tests/integration/data_transformer/common.hpp @@ -25,7 +25,7 @@ template int data_transformer_result_validate(data_type_in *in_device, data_type_out *out_device, size_t mat_m, size_t mat_n, bool is_transposed, int need_fp8_op, data_type_acc *amax_ptr_device, - data_type_acc *scale_device, sycl::queue queue) { + data_type_acc *scale_device, sycl::queue &queue) { auto in = alloc_host_and_copy( in_device, mat_m * mat_n, queue); auto out = alloc_host_and_copy( diff --git a/tests/integration/gemm/bf16/common.hpp b/tests/integration/gemm/bf16/common.hpp index 29e22495..717b35cb 100644 --- a/tests/integration/gemm/bf16/common.hpp +++ b/tests/integration/gemm/bf16/common.hpp @@ -277,30 +277,13 @@ class Test11 : public TestBase { using data_type_acc = float; }; -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); diff --git a/tests/integration/gemm/bf16/kernel_func.hpp b/tests/integration/gemm/bf16/kernel_func.hpp index f98ad1f8..08f63526 100644 --- a/tests/integration/gemm/bf16/kernel_func.hpp +++ b/tests/integration/gemm/bf16/kernel_func.hpp @@ -44,7 +44,7 @@ struct bgemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/bf16/main.cpp b/tests/integration/gemm/bf16/main.cpp index 97d77bbd..a45e416f 100644 --- a/tests/integration/gemm/bf16/main.cpp +++ b/tests/integration/gemm/bf16/main.cpp @@ -30,9 +30,9 @@ TYPED_TEST_SUITE_P(bgemm_test); TYPED_TEST_P(bgemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, bgemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(bgemm_test, esimd); using tests = ::testing::Types 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/fp16/main.cpp b/tests/integration/gemm/fp16/main.cpp index b27d2ac6..7d123e89 100644 --- a/tests/integration/gemm/fp16/main.cpp +++ b/tests/integration/gemm/fp16/main.cpp @@ -370,30 +370,13 @@ class Test16 : public TestBase { using data_type_acc = float; }; -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); @@ -419,9 +402,9 @@ TYPED_TEST_SUITE_P(fp16_gemm_test); TYPED_TEST_P(fp16_gemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, fp16_gemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(fp16_gemm_test, esimd); using tests = ::testing::Types -class input_buffer_init { +class Test9 : public TestBase { public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } + static constexpr size_t batch_size = 64; + static constexpr size_t mat_m = 256; + static constexpr size_t mat_n = 256; + static constexpr size_t mat_k = 256; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 8; + static constexpr uint32_t l3_kslicing = 2; + static constexpr uint32_t slm_kslicing = 1; + static constexpr mem_layout layout_a = mem_layout::row_major; + static constexpr mem_layout layout_b = mem_layout::row_major; + static constexpr mma_engine engine = mma_engine::xmx; +}; + +class Test10 : public TestBase { +public: + static constexpr size_t batch_size = 64; + static constexpr size_t mat_m = 256; + static constexpr size_t mat_n = 256; + static constexpr size_t mat_k = 256; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 8; + static constexpr uint32_t l3_kslicing = 2; + static constexpr uint32_t slm_kslicing = 1; + static constexpr mem_layout layout_a = mem_layout::row_major; + static constexpr mem_layout layout_b = mem_layout::row_major; +}; + +class Test11 : public TestBase { +public: + static constexpr size_t batch_size = 35; + static constexpr size_t mat_m = 4193; + static constexpr size_t mat_k = 1134; + static constexpr size_t mat_n = 686; + static constexpr size_t wg_m = 256; + static constexpr size_t wg_n = 256; + static constexpr size_t sg_m = 32; + static constexpr size_t sg_n = 64; + static constexpr size_t sg_k = 32; + static constexpr uint32_t l3_kslicing = 1; + static constexpr uint32_t slm_kslicing = 16; + static constexpr mem_layout layout_a = mem_layout::col_major; + static constexpr mem_layout layout_b = mem_layout::row_major; + static constexpr mma_engine engine = mma_engine::xmx; }; template using sgemm_func = sgemm_test_func; + Test::layout_a, Test::layout_b, Test::l3_kslicing, Test::slm_kslicing, + Test::engine>; template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, - C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, - Test::layout_a, Test::layout_b); + C, Test::batch_size, Test::mat_m, Test::mat_k, Test::mat_n, + queue, context, Test::layout_a, Test::layout_b); } }; diff --git a/tests/integration/gemm/fp32/kernel_func.hpp b/tests/integration/gemm/fp32/kernel_func.hpp index 9fe185da..1ef52f2d 100644 --- a/tests/integration/gemm/fp32/kernel_func.hpp +++ b/tests/integration/gemm/fp32/kernel_func.hpp @@ -23,7 +23,7 @@ using namespace gpu::xetla; template + uint32_t slm_kslicing, mma_engine engine> struct sgemm_test_func { static const char *func_name() { return "sgemm_test_func"; } @@ -38,13 +38,13 @@ struct sgemm_test_func { static constexpr uint32_t prefetch_distance = 1; using brgemm_t = typename brgemm_selector_t::brgemm; + tile_shape, sg_k, engine, gpu_arch::Xe, prefetch_distance, + periodic_sync_interval>::brgemm; using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/fp32/main.cpp b/tests/integration/gemm/fp32/main.cpp index 13cb0cb1..07cf27fd 100644 --- a/tests/integration/gemm/fp32/main.cpp +++ b/tests/integration/gemm/fp32/main.cpp @@ -31,12 +31,14 @@ std::string esimd_compile_string template class sgemm_test : public ::testing::Test {}; TYPED_TEST_SUITE_P(sgemm_test); + TYPED_TEST_P(sgemm_test, esimd) { gemm_exec(TypeParam::mat_m, - TypeParam::mat_n, TypeParam::mat_k, esimd_compile_string); + result_validate, sgemm_func>(TypeParam::mat_m, TypeParam::mat_n, + TypeParam::mat_k, esimd_compile_string, TypeParam::batch_size); } + REGISTER_TYPED_TEST_SUITE_P(sgemm_test, esimd); using tests = ::testing::Types; + Test8, Test9, Test10, Test11>; INSTANTIATE_TYPED_TEST_SUITE_P(sgemm_test_suite, sgemm_test, tests); diff --git a/tests/integration/gemm/int8/common.hpp b/tests/integration/gemm/int8/common.hpp index d7f3590e..269bd5e5 100644 --- a/tests/integration/gemm/int8/common.hpp +++ b/tests/integration/gemm/int8/common.hpp @@ -182,38 +182,62 @@ class Test6 : public TestBase { using data_type_acc = int; }; -template -class input_buffer_init { +template +using int8_gemm_func = int8gemm_test_func; + +template +class result_validate { + public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; + int operator()(data_type_a *A_device, data_type_b *B_device, + data_type_c *C_device, sycl::queue &queue, sycl::context &context) { + auto A = alloc_host_and_copy( + A_device, Test::mat_m * Test::mat_k, queue); + auto B = alloc_host_and_copy( + B_device, Test::mat_k * Test::mat_n, queue); + auto C = alloc_host_and_copy( + C_device, Test::mat_m * Test::mat_n, queue); + + buff_cmp::buff_vals data( + C, Test::mat_m, Test::mat_n, Test::mat_n); + std::vector acc_buffer(Test::mat_m * Test::mat_n, 0); + + { + bool is_col_major_a = Test::layout_a == mem_layout::col_major; + bool is_col_major_b = Test::layout_b == mem_layout::col_major; + for (int i = 0; i < Test::mat_m; i++) { + for (int j = 0; j < Test::mat_n; j++) { + for (int k = 0; k < Test::mat_k; k++) { + data_type_acc a_temp = is_col_major_a + ? A[i + k * Test::mat_m] + : A[i * Test::mat_k + k]; + data_type_acc b_temp = is_col_major_b + ? B[k + j * Test::mat_k] + : B[k * Test::mat_n + j]; + acc_buffer[i * Test::mat_n + j] + = acc_buffer[i * Test::mat_n + j] + + a_temp * b_temp; + } + } + } } - } -}; -template -using int8_gemm_func = int8gemm_test_func; + buff_cmp::buff_vals other( + acc_buffer.data(), Test::mat_m, Test::mat_n, Test::mat_n); + bool result = buff_cmp::xetla_buff_cmp(data, other, + Test::name(Test::mat_m, Test::mat_n, Test::mat_k, Test::wg_m, + Test::wg_n, Test::sg_m, Test::sg_n, Test::sg_k, + Test::layout_a, Test::layout_b)); -template -class result_validate { + free(A); + free(B); + free(C); -public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { - return gemm_result_validate(A, B, - C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, - Test::layout_a, Test::layout_b); + return result ? 0 : 1; } }; diff --git a/tests/integration/gemm/int8/kernel_func.hpp b/tests/integration/gemm/int8/kernel_func.hpp index 636398ef..a3561089 100644 --- a/tests/integration/gemm/int8/kernel_func.hpp +++ b/tests/integration/gemm/int8/kernel_func.hpp @@ -44,7 +44,7 @@ struct int8gemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/int8/main.cpp b/tests/integration/gemm/int8/main.cpp index 92829979..a50a59d1 100644 --- a/tests/integration/gemm/int8/main.cpp +++ b/tests/integration/gemm/int8/main.cpp @@ -35,9 +35,9 @@ TYPED_TEST_SUITE_P(int8_gemm_test); TYPED_TEST_P(int8_gemm_test, esimd) { gemm_exec(TypeParam::mat_m, TypeParam::mat_n, - TypeParam::mat_k, esimd_compile_string); + typename TypeParam::data_type_acc, result_validate, int8_gemm_func>( + TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, + esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(int8_gemm_test, esimd); diff --git a/tests/integration/gemm/int8_quantization/common.hpp b/tests/integration/gemm/int8_quantization/common.hpp index 5e4942ef..138fc1e0 100644 --- a/tests/integration/gemm/int8_quantization/common.hpp +++ b/tests/integration/gemm/int8_quantization/common.hpp @@ -25,7 +25,7 @@ int gemm_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, data_type_param *scale_device, data_type_param *offset_device, int m, int k, int n, gpu::xetla::mem_layout mem_layout_a, - gpu::xetla::mem_layout mem_layout_b, sycl::queue queue) { + gpu::xetla::mem_layout mem_layout_b, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, m * k, queue); auto B = alloc_host_and_copy(B_device, k * n, queue); diff --git a/tests/integration/gemm/int8_quantization/kernel_func.hpp b/tests/integration/gemm/int8_quantization/kernel_func.hpp index 1f3c7e85..8cb0c27b 100644 --- a/tests/integration/gemm/int8_quantization/kernel_func.hpp +++ b/tests/integration/gemm/int8_quantization/kernel_func.hpp @@ -37,7 +37,7 @@ struct igemm_quantize_func { using quant_op_t = subgroup::quant_op_t; using epilogue_t = gpu::xetla::group::epilogue_t< gpu::xetla::group::epilogue_policy_quant_op, + gpu_arch::Xe>, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/tf32/kernel_func.hpp b/tests/integration/gemm/tf32/kernel_func.hpp index 8e272782..fec8bc5b 100644 --- a/tests/integration/gemm/tf32/kernel_func.hpp +++ b/tests/integration/gemm/tf32/kernel_func.hpp @@ -42,7 +42,7 @@ struct gemm_test_func { using update_method = typename std::conditional<(l3_kslicing > 1), result_reduce_sum, result_overwrite>::type; using epilogue_t = epilogue_t< - epilogue_policy_tile_op, + epilogue_policy_default, tile_shape, mem_desc_t>; diff --git a/tests/integration/gemm/tf32/main.cpp b/tests/integration/gemm/tf32/main.cpp index 40d7645b..85047073 100644 --- a/tests/integration/gemm/tf32/main.cpp +++ b/tests/integration/gemm/tf32/main.cpp @@ -19,29 +19,12 @@ #include "test.hpp" #include -template -class input_buffer_init { -public: - void operator()(dtype_a *A, dtype_b *B, dtype_c *C, size_t size_a, - size_t size_b, size_t size_c) { - for (unsigned i = 0; i < size_a; ++i) { - A[i] = (i * 3) % 17; - } - for (unsigned i = 0; i < size_b; ++i) { - B[i] = (i * 5) % 19; - } - for (unsigned i = 0; i < size_c; ++i) { - C[i] = 0; - } - } -}; - template class result_validate { public: - int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue queue, - sycl::context context) { + int operator()(dtype_a *A, dtype_b *B, dtype_c *C, sycl::queue &queue, + sycl::context &context) { return gemm_result_validate(A, B, C, 1, Test::mat_m, Test::mat_k, Test::mat_n, queue, context, Test::layout_a, Test::layout_b); @@ -67,9 +50,8 @@ TYPED_TEST_SUITE_P(gemm_tf32); TYPED_TEST_P(gemm_tf32, esimd) { gemm_exec( - TypeParam::mat_m, TypeParam::mat_n, TypeParam::mat_k, - esimd_compile_string); + float, result_validate, gemm_func>(TypeParam::mat_m, + TypeParam::mat_n, TypeParam::mat_k, esimd_compile_string); } REGISTER_TYPED_TEST_SUITE_P(gemm_tf32, esimd); INSTANTIATE_TYPED_TEST_SUITE_P(gemm_tf32_suite, gemm_tf32, tests); diff --git a/tests/integration/row_reduction/common.hpp b/tests/integration/row_reduction/common.hpp index 1b7c7280..dafca051 100644 --- a/tests/integration/row_reduction/common.hpp +++ b/tests/integration/row_reduction/common.hpp @@ -31,7 +31,7 @@ int reduction_result_validate(data_type_in *device_in, data_type_out *device_out, data_type_w *device_w_in, data_type_x *device_x_out, data_type_d *device_d_out, uint8_t *device_mask_in, int m, int n, float scale, - reduction_fused_kind fused_op, sycl::queue queue) { + reduction_fused_kind fused_op, sycl::queue &queue) { int err_cnt = 0; bool is_bias_gelu_bwd = fused_op == reduction_fused_kind::bias_gelu_w_bwd; bool is_bias_dropout_bwd diff --git a/tests/integration/sg_dropout_op/common.hpp b/tests/integration/sg_dropout_op/common.hpp index bed1f604..3c906b87 100644 --- a/tests/integration/sg_dropout_op/common.hpp +++ b/tests/integration/sg_dropout_op/common.hpp @@ -30,7 +30,7 @@ template int dropout_result_validate(data_type_x *in_device, data_type_y *out_device, int m, int n, uint8_t *buffer_mask_device, float drop_out_scale, - sycl::queue queue) { + sycl::queue &queue) { auto in = alloc_host_and_copy(in_device, m * n, queue); auto out = alloc_host_and_copy(out_device, m * n, queue); auto buffer_mask diff --git a/tests/integration/sg_dropout_op/kernel_func.hpp b/tests/integration/sg_dropout_op/kernel_func.hpp index fb160cd0..c60de6e2 100644 --- a/tests/integration/sg_dropout_op/kernel_func.hpp +++ b/tests/integration/sg_dropout_op/kernel_func.hpp @@ -56,10 +56,9 @@ struct dropout_func_t { subgroup::dropout_op_t, subgroup::rng_dropout_op_t>::type; - using epilogue_t - = group::epilogue_t, - tile_shape, mem_desc_out_t>; + using epilogue_t = group::epilogue_t< + group::epilogue_policy_tile_op, tile_shape, + mem_desc_out_t>; using epilogue_args_t = typename epilogue_t::arguments_t; static inline void run(xetla_exec_item<3> &ei, dtype_in *mat_in_ptr, diff --git a/tests/integration/vector_add/bf16_2d/common.hpp b/tests/integration/vector_add/bf16_2d/common.hpp index 1295e895..5d7b675d 100644 --- a/tests/integration/vector_add/bf16_2d/common.hpp +++ b/tests/integration/vector_add/bf16_2d/common.hpp @@ -26,7 +26,8 @@ class Test1; #define data_type bf16 int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, unsigned pitch, sycl::queue queue) { + data_type *C_device, unsigned Size, unsigned pitch, + sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, pitch * pitch, queue); auto B = alloc_host_and_copy(B_device, pitch * pitch, queue); auto C = alloc_host_and_copy(C_device, pitch * pitch, queue); diff --git a/tests/integration/vector_add/int32_1d/common.hpp b/tests/integration/vector_add/int32_1d/common.hpp index bdf408ee..d5192be3 100644 --- a/tests/integration/vector_add/int32_1d/common.hpp +++ b/tests/integration/vector_add/int32_1d/common.hpp @@ -25,7 +25,7 @@ using namespace gpu::xetla; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/integration/vector_add/int32_2d/common.hpp b/tests/integration/vector_add/int32_2d/common.hpp index 846a8254..6e754261 100644 --- a/tests/integration/vector_add/int32_2d/common.hpp +++ b/tests/integration/vector_add/int32_2d/common.hpp @@ -24,7 +24,7 @@ class Test1; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/integration/vector_add/tf32_1d/common.hpp b/tests/integration/vector_add/tf32_1d/common.hpp index 3cbb26b9..22b78902 100644 --- a/tests/integration/vector_add/tf32_1d/common.hpp +++ b/tests/integration/vector_add/tf32_1d/common.hpp @@ -28,7 +28,7 @@ class Test1; using namespace cl::sycl; int vadd_result_validate(data_type *A_device, data_type *B_device, - data_type *C_device, unsigned Size, sycl::queue queue) { + data_type *C_device, unsigned Size, sycl::queue &queue) { auto A = alloc_host_and_copy(A_device, Size, queue); auto B = alloc_host_and_copy(B_device, Size, queue); auto C = alloc_host_and_copy(C_device, Size, queue); diff --git a/tests/unit/epilogue_tile_op/kernel_func.hpp b/tests/unit/epilogue_tile_op/kernel_func.hpp index f4976cd5..0f67c03e 100644 --- a/tests/unit/epilogue_tile_op/kernel_func.hpp +++ b/tests/unit/epilogue_tile_op/kernel_func.hpp @@ -34,8 +34,7 @@ struct tile_elemwise_op_func { using tile_shape = tile_shape_t; using mem_desc_c_t = mem_desc_t; - using epilogue_policy = epilogue_policy_tile_op; + using epilogue_policy = epilogue_policy_tile_op; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -70,7 +69,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -105,7 +104,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -136,7 +135,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; @@ -165,7 +164,7 @@ struct tile_elemwise_op_func; using epilogue_policy = epilogue_policy_tile_op< elemwise_reduce_op_t, - result_overwrite, gpu_arch::Xe>; + gpu_arch::Xe>; using epilogue_t = epilogue_t; using work_group_t = typename tile_shape::work_group_t; using epilogue_args_t = typename epilogue_t::arguments_t; diff --git a/tests/utils/buff_compare.hpp b/tests/utils/buff_compare.hpp index 3acb5b7b..bd587a25 100644 --- a/tests/utils/buff_compare.hpp +++ b/tests/utils/buff_compare.hpp @@ -34,7 +34,7 @@ namespace buff_cmp { /// @{ // convenient datatype to represent ulp-converted fp buffers -using ulp_vec = std::vector; +using ulp_vec = std::vector; /// ///@brief Structure used to describe tensors / buffers as stdlib vectors, idx_mapping vector is used to ignore "unwanted" elements from a tensor array. @@ -44,8 +44,8 @@ template struct buff_vals { using type = dtype; std::vector buff; - std::vector idx_mapping; - unsigned size; + std::vector idx_mapping; + size_t size; /// @brief Initializes and empty buff_vals structure. /// @tparam dtype Datatype of the output structure and input buffer. @@ -55,10 +55,10 @@ struct buff_vals { /// @tparam dtype Datatype of the output structure and input buffer. /// @param data Pointer to buffer of input data. /// @param n Size of input buffer data - buff_vals(dtype_src *data, uint32_t n) { + buff_vals(dtype_src *data, size_t n) { this->size = n; this->buff.resize(this->size, 0); - for (uint32_t i = 0; i < this->size; ++i) { + for (size_t i = 0; i < this->size; ++i) { this->buff[i] = data[i]; this->idx_mapping.push_back(i); } @@ -70,14 +70,13 @@ struct buff_vals { /// @param Blocky By default used to define size of input buffer. If input is organized as tensor (meaning, by blocks) then defines block height. /// @param Blockx Default value is 1 for non-tensor buffers, otherwise describes tensor block width. /// @param Sizex Default value is 1 for non-tensor buffers, otherwise describes tensor pitch. - buff_vals( - dtype_src *data, uint32_t Blocky, uint32_t Blockx, uint32_t Sizex) { + buff_vals(dtype_src *data, size_t Blocky, size_t Blockx, size_t Sizex) { this->size = Blockx * Blocky; this->buff.resize(this->size, 0); this->idx_mapping.resize(this->size, 0); - unsigned idx = 0; - for (int i = 0; i < Blocky; ++i) { - for (int j = 0; j < Blockx; ++j) { + size_t idx = 0; + for (size_t i = 0; i < Blocky; ++i) { + for (size_t j = 0; j < Blockx; ++j) { this->buff[idx] = data[i * Sizex + j]; this->idx_mapping[idx] = i * Sizex + j; ++idx; @@ -89,7 +88,7 @@ struct buff_vals { /// @tparam dtype Datatype of the given structure and new element. /// @param val Element to be added to structure. /// @param idx Index mapping for element being added. - void push(dtype val, uint32_t idx) { + void push(dtype val, size_t idx) { ++this->size; this->buff.push_back(val); this->idx_mapping.push_back(idx); @@ -121,15 +120,15 @@ struct buff_vals { /// template buff_vals xetla_get_buff_vals( - dtype *data, unsigned Blocky, unsigned Blockx = 1, unsigned Sizex = 1) { + dtype *data, size_t Blocky, size_t Blockx = 1, size_t Sizex = 1) { buff_vals res; res.size = Blockx * Blocky; res.buff.resize(res.size, 0); res.idx_mapping.resize(res.size, 0); - unsigned idx = 0; - for (int i = 0; i < Blocky; ++i) { - for (int j = 0; j < Blockx; ++j) { + size_t idx = 0; + for (size_t i = 0; i < Blocky; ++i) { + for (size_t j = 0; j < Blockx; ++j) { res.buff[idx] = data[i * Sizex + j]; res.idx_mapping[idx] = i * Sizex + j; ++idx; @@ -142,7 +141,7 @@ buff_vals xetla_get_buff_vals( struct rel_abs_vals { std::vector ate; std::vector rte; - unsigned size; + size_t size; }; /// @@ -171,7 +170,7 @@ rel_abs_vals xetla_get_rte_and_ate(T1 &v1, T2 &v2) { res.rte.resize(res.size, 0.0); res.ate.resize(res.size, 0.0); - for (unsigned i = 0; i < res.size; ++i) { + for (size_t i = 0; i < res.size; ++i) { res.ate[i] = get_ate(v1.buff[i], v2.buff[i]); res.rte[i] = get_rte(v1.buff[i], v2.buff[i]); } @@ -190,7 +189,7 @@ ulp_vec xetla_get_ulp_buffer(T &v1) { using dtype = typename T::type; using uint_dtype = gpu::xetla::uint_type_t; ulp_vec ulp_buff(v1.size, 0); - for (unsigned i = 0; i < ulp_buff.size(); ++i) { + for (size_t i = 0; i < ulp_buff.size(); ++i) { uint_dtype val = (*reinterpret_cast(&v1.buff[i])); ulp_buff[i] = val; } @@ -211,7 +210,7 @@ ulp_vec xetla_get_ulp_buffer(T &v1) { /// template bool _handle_fp_types(buff_vals &data, buff_vals &other, - std::string name, uint64_t ulp_tol, double abs_tol) { + std::string name, size_t ulp_tol, double abs_tol) { if (std::is_same, gpu::xetla::bf16>::value) { if (ulp_tol == 0) ulp_tol = 8; if (abs_tol == 0) abs_tol = 0.25; @@ -233,7 +232,7 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, ulp_vec ulp_data = xetla_get_ulp_buffer(data); ulp_vec ulp_other = xetla_get_ulp_buffer(other); - auto get_ulp_ate = [=](uint64_t a, uint64_t b) { + auto get_ulp_ate = [=](size_t a, size_t b) { if (a > b) return a - b; else @@ -242,10 +241,10 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, ulp_vec aulpte; aulpte.resize(ulp_data.size(), 0); - for (unsigned i = 0; i < ulp_data.size(); ++i) + for (size_t i = 0; i < ulp_data.size(); ++i) aulpte[i] = get_ulp_ate(ulp_data[i], ulp_other[i]); - unsigned aulpidx + size_t aulpidx = std::max_element(aulpte.begin(), aulpte.end()) - aulpte.begin(); std::cout << "\t" @@ -258,16 +257,16 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, << "data_val: " << ulp_data[aulpidx] << " gold_val: " << ulp_other[aulpidx] << std::endl; - uint64_t ulp_threshold = ulp_tol; + size_t ulp_threshold = ulp_tol; double small_num_threshold = abs_tol; - uint32_t diff_elems_count = 0; + size_t diff_elems_count = 0; bool flag = true; - for (unsigned i = 0; i < ulp_data.size(); ++i) { + for (size_t i = 0; i < ulp_data.size(); ++i) { float des = other.buff[i]; float act = data.buff[i]; - uint64_t ulp_des = ulp_other[i]; - uint64_t ulp_act = ulp_data[i]; - uint64_t sub_ulp = ulp_act - ulp_des; + size_t ulp_des = ulp_other[i]; + size_t ulp_act = ulp_data[i]; + size_t sub_ulp = ulp_act - ulp_des; if (ulp_des > ulp_act) sub_ulp = ulp_des - ulp_act; if (!((fabs(act - des) <= small_num_threshold) || (sub_ulp <= ulp_threshold))) { @@ -292,7 +291,7 @@ bool _handle_fp_types(buff_vals &data, buff_vals &other, template bool _cast_and_handle_fp_types(T1 &data, T2 &other, std::string name, - double diff_elems_tol, uint64_t ulp_tol, double abs_tol) { + double diff_elems_tol, size_t ulp_tol, double abs_tol) { buff_vals casted_data, casted_other; casted_data.size = data.size; casted_data.buff @@ -323,8 +322,7 @@ bool _cast_and_handle_fp_types(T1 &data, T2 &other, std::string name, /// template bool xetla_buff_cmp(T1 &data, T2 &other, std::string name, - double diff_elems_tol = 0.02, uint64_t ulp_tol = 0, - double abs_tol = 0) { + double diff_elems_tol = 0.02, size_t ulp_tol = 0, double abs_tol = 0) { if (data.size != other.size) { std::cout << "ERROR: buffer size or shape mismatch!\n"; return false; @@ -383,8 +381,8 @@ bool xetla_buff_cmp(T1 &data, T2 &other, std::string name, data, other, name, diff_elems_tol, ulp_tol, abs_tol); } } else { - uint32_t diff_elems_count = 0; - for (unsigned i = 0; i < data.size; ++i) + size_t diff_elems_count = 0; + for (size_t i = 0; i < data.size; ++i) if (data.buff[i] != other.buff[i]) ++diff_elems_count; float fail_rate = diff_elems_count / ((float)data.size) * 100; float pass_rate = 100 - fail_rate; diff --git a/tests/utils/common.hpp b/tests/utils/common.hpp index 1678e512..3fd6c5b7 100644 --- a/tests/utils/common.hpp +++ b/tests/utils/common.hpp @@ -101,7 +101,7 @@ inline data_type *alloc_device_and_init(size_t size, sycl::queue &queue, sycl::device &device, sycl::context &context) { auto host_ptr = static_cast(malloc(size * sizeof(data_type))); - for (uint32_t i = 0; i < size; ++i) { + for (size_t i = 0; i < size; ++i) { init_func(host_ptr, i); } @@ -129,15 +129,15 @@ template int gemm_result_validate(data_type_a *A_device, data_type_b *B_device, data_type_c *C_device, uint32_t batch_size, uint32_t m, uint32_t k, - uint32_t n, sycl::queue queue, sycl::context context, + uint32_t n, sycl::queue &queue, sycl::context &context, mem_layout mem_layout_a_ = mem_layout::row_major, mem_layout mem_layout_b_ = mem_layout::row_major) { bool is_col_major_a = mem_layout_a_ == mem_layout::col_major; bool is_col_major_b = mem_layout_b_ == mem_layout::col_major; // define slice of each matrices - uint32_t size_a_slice = m * k; - uint32_t size_b_slice = k * n; - uint32_t size_c_slice = m * n; + size_t size_a_slice = m * k; + size_t size_b_slice = k * n; + size_t size_c_slice = m * n; auto A = alloc_host_and_copy( A_device, batch_size * size_a_slice, queue); diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp index db9c620a..499345fc 100644 --- a/tests/utils/execution.hpp +++ b/tests/utils/execution.hpp @@ -17,20 +17,23 @@ #pragma once #include "profiling.hpp" +#include "xetla.hpp" using namespace cl::sycl; using namespace gpu; using namespace gpu::xetla; +enum class test_result { complete = 0, skip = 1, fail = 2 }; + template class initialize_func, template class validate_func, template class KERNEL, int SLMSIZE = 128 * 1024, int BARNUM = 32> void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, - std::string compile_str) { + std::string compile_str, size_t batch = 1) { + test_result result = test_result::complete; constexpr size_t wg_tile_m = Test::wg_m; constexpr size_t wg_tile_n = Test::wg_n; @@ -38,32 +41,33 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, constexpr size_t sg_tile_n = Test::sg_n; constexpr size_t sg_tile_k = Test::sg_k; - int size_a = matrix_m * matrix_k; - int size_b = matrix_k * matrix_n; - int size_c = matrix_m * matrix_n; + size_t size_a = matrix_m * matrix_k; + size_t size_b = matrix_k * matrix_n; + size_t size_c = matrix_m * matrix_n; sycl::property_list properties {sycl::property::queue::enable_profiling()}; auto queue = sycl::queue(properties); auto context = queue.get_info(); auto device = queue.get_info(); - std::cout << "Running on " << device.get_info() << "\n"; + std::cout << "Running on batch: " << batch << ", " + << device.get_info() << "\n"; auto A = alloc_device_and_init( - size_a, + batch * size_a, [](data_type_a *data, size_t idx) { - data[idx] = static_cast(random_float()); + data[idx] = static_cast((idx * 3) % 17); }, queue, device, context); auto B = alloc_device_and_init( - size_b, + batch * size_b, [](data_type_b *data, size_t idx) { - data[idx] = static_cast(random_float()); + data[idx] = static_cast((idx * 5) % 19); }, queue, device, context); auto C = alloc_device_and_init( - size_c, + batch * size_c, [](data_type_c *data, size_t idx) { - data[idx] = static_cast(0.0f); + data[idx] = static_cast(0); }, queue, device, context); @@ -92,33 +96,88 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, setenv("SYCL_PROGRAM_COMPILE_OPTIONS", compile_str.c_str(), 1); kernel_bundle exeBundle = build(inputBundle); unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS"); - auto e_esimd = queue.submit([&](handler &cgh) { - cgh.use_kernel_bundle(exeBundle); - cgh.parallel_for( - nd_range, [=](nd_item<3> item) SYCL_ESIMD_KERNEL { - gpu::xetla::xetla_exec_item<3> ei(item); - gpu::xetla::xetla_local_init(); - gpu::xetla::xetla_nbarrier_init(); - KERNEL::run(ei, A, B, C, matrix_m, - matrix_n, matrix_k); - }); - }); - e_esimd.wait(); + using namespace gpu::xetla::group; + using namespace gpu::xetla::kernel; + using namespace gpu::xetla::subgroup; + using tile_shape + = tile_shape_t; + static constexpr uint32_t periodic_sync_interval = 8; + static constexpr uint32_t prefetch_distance = 3; + using brgemm_t = typename brgemm_selector_t::brgemm; + + using update_method = typename std::conditional<(Test::l3_kslicing > 1), + result_reduce_sum, result_overwrite>::type; + using epilogue_t = epilogue_t< + epilogue_policy_default, + tile_shape, + mem_desc_t>; + + using gemm_op_t = gemm_t, + brgemm_t, epilogue_t>; + + for (size_t i = 0; i < batch; i++) { + auto A_ptr = A + i * size_a; + auto B_ptr = B + i * size_b; + auto C_ptr = C + i * size_c; + typename gemm_op_t::arguments_t arg(matrix_m, matrix_k, matrix_n, + A_ptr, + Test::layout_a == mem_layout::col_major ? matrix_m + : matrix_k, + B_ptr, + Test::layout_b == mem_layout::col_major ? matrix_k + : matrix_n, + C_ptr, matrix_n); + if (!gemm_op_t::can_implement(arg)) { + std::cout << "The arguments cannot be supported, skip ... " + << std::endl; + result = test_result::skip; + break; + } + + auto e_esimd = queue.submit([&](handler &cgh) { + cgh.use_kernel_bundle(exeBundle); + cgh.parallel_for( + nd_range, [=](nd_item<3> item) SYCL_ESIMD_KERNEL { + gpu::xetla::xetla_exec_item<3> ei(item); + gpu::xetla::xetla_local_init(); + gpu::xetla::xetla_nbarrier_init(); + KERNEL::run(ei, A_ptr, B_ptr, C_ptr, + matrix_m, matrix_n, matrix_k); + }); + }); + e_esimd.wait(); + } } catch (cl::sycl::exception const &e) { std::cout << "SYCL exception caught: " << e.what() << '\n'; - FAIL(); + result = test_result::fail; } // validation - validate_func - vfunc; - ASSERT_EQ(0, vfunc(A, B, C, queue, context)); + if (result == test_result::complete) { + validate_func + vfunc; + ASSERT_EQ(0, vfunc(A, B, C, queue, context)); + } free(A, context); free(B, context); free(C, context); + + if (result == test_result::skip) { + GTEST_SKIP(); + } else if (result != test_result::complete) { + FAIL(); + } } /// @brief The template function to execute kernel in esimd way for unit test framework @@ -128,8 +187,8 @@ void gemm_exec(size_t matrix_m, size_t matrix_n, size_t matrix_k, /// @param nd_range the range of workitems /// @param validate_result validation function, taking 3 parameters buffer A, B as input C as output /// -template +template void kernel_run(auto nd_range, auto validate_result) { queue queue {}; diff --git a/tools/scripts/env.sh b/tools/scripts/env.sh index 357f58e3..36c5cf1e 100755 --- a/tools/scripts/env.sh +++ b/tools/scripts/env.sh @@ -10,4 +10,5 @@ source ${ONEAPI_INSTALL_PATH}/setvars.sh # Export environment variables export CC=icx +export CXX=icpx export ONEAPI_DEVICE_SELECTOR=level_zero1:*