From 39e3747ff6c6f0d2b9dfd1d3a131c2a9c5529458 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI <104583441+OuadiElfarouki@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:17:31 +0000 Subject: [PATCH 1/2] Refined transpose kernel configurations for CPU target (#485) * Down-sized some copy-op benchmarks --- common/include/common/common_utils.hpp | 76 ++++++++----------- .../extension/backend/default_cpu.hpp | 8 +- 2 files changed, 36 insertions(+), 48 deletions(-) diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index fc4562aa8..6cb967cc3 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -1242,15 +1242,13 @@ static inline std::vector> get_matcopy_params( std::vector> matcopy_default; constexpr index_t dmin = 64, dmax = 8192; constexpr scalar_t alpha{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - matcopy_default.push_back( - std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul)); - } - } + matcopy_default.push_back( + std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul)); } } } @@ -1287,17 +1285,15 @@ static inline std::vector> get_omatcopy2_params( std::vector> omatcopy2_default; constexpr index_t dmin = 1024, dmax = 8192; constexpr scalar_t alpha{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t inc_a = 1; inc_a < 3; ++inc_a) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t inc_b = 1; inc_b < 3; ++inc_b) { - omatcopy2_default.push_back(std::make_tuple( - trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b)); - } - } + for (index_t inc_a = 1; inc_a < 3; ++inc_a) { + for (index_t inc_b = 1; inc_b < 3; ++inc_b) { + omatcopy2_default.push_back(std::make_tuple( + trans, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b)); } } } @@ -1336,21 +1332,20 @@ get_matcopy_batch_params(Args& args) { if (args.csv_param.empty()) { warning_no_csv(); std::vector> matcopy_batch_default; - constexpr index_t dmin = 256, dmax = 8192; + constexpr index_t dmin = 256, dmax = 4096; constexpr scalar_t alpha{2}; constexpr index_t batch_size{3}; constexpr index_t stride_a_mul{1}; constexpr index_t stride_b_mul{1}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - matcopy_batch_default.push_back( - std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul, - stride_a_mul, stride_b_mul, batch_size)); - } - } + matcopy_batch_default.push_back( + std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul, + stride_a_mul, stride_b_mul, batch_size)); } } } @@ -1386,22 +1381,19 @@ static inline std::vector> get_omatadd_params( if (args.csv_param.empty()) { warning_no_csv(); std::vector> omatadd_default; - constexpr index_t dmin = 64, dmax = 8192; + constexpr index_t dmin = 64, dmax = 4096; constexpr scalar_t alpha{2}; constexpr scalar_t beta{2}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans_a : {'n', 't'}) { for (char trans_b : {'n', 't'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) { - omatadd_default.push_back( - std::make_tuple(trans_a, trans_b, m, n, alpha, beta, - lda_mul, ldb_mul, ldc_mul)); - } - } - } + omatadd_default.push_back(std::make_tuple(trans_a, trans_b, m, n, + alpha, beta, lda_mul, + ldb_mul, ldc_mul)); } } } @@ -1439,27 +1431,23 @@ get_omatadd_batch_params(Args& args) { if (args.csv_param.empty()) { warning_no_csv(); std::vector> omatadd_batch_default; - constexpr index_t dmin = 256, dmax = 8192; + constexpr index_t dmin = 1024, dmax = 4096; constexpr scalar_t alpha{2}; constexpr scalar_t beta{2}; constexpr index_t batch_size{3}; constexpr index_t stride_a_mul{1}; constexpr index_t stride_b_mul{1}; constexpr index_t stride_c_mul{1}; + constexpr index_t lda_mul = 1; + constexpr index_t ldb_mul = 1; + constexpr index_t ldc_mul = 1; for (char trans_a : {'n', 't'}) { - for (char trans_b : {'n', 't'}) { + for (char trans_b : {'n'}) { for (index_t m = dmin; m <= dmax; m *= 2) { for (index_t n = dmin; n <= dmax; n *= 2) { - for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) { - for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) { - for (index_t ldc_mul = 1; ldc_mul < 2; ++ldc_mul) { - omatadd_batch_default.push_back( - std::make_tuple(trans_a, trans_b, m, n, alpha, beta, - lda_mul, ldb_mul, ldc_mul, stride_a_mul, - stride_b_mul, stride_c_mul, batch_size)); - } - } - } + omatadd_batch_default.push_back(std::make_tuple( + trans_a, trans_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, + stride_a_mul, stride_b_mul, stride_c_mul, batch_size)); } } } diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index d8a2f6c24..ba714e78c 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -37,12 +37,12 @@ typename sb_handle_t::event_t _transpose_outplace( container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in, container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { - if (_M * _N < (1 << 20)) { + if (_M * _N < (1 << 16)) { return blas::internal::_transpose_outplace_impl<16, 64, 64, false>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } else { - return blas::internal::_transpose_outplace_impl<32, 128, 64, false>( + return blas::internal::_transpose_outplace_impl<32, 32, 64, false>( sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_, _ld_out, _inc_out, _stride_out, _batch_size, _dependencies); } @@ -58,13 +58,13 @@ typename sb_handle_t::event_t _transpose_add( index_t _b_rows, index_t _b_cols, index_t _stride_b, container_2_t c_, index_t _ld_c, index_t _stride_c, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) { - if (_M * _N < (1 << 20)) { + if (_M * _N < (1 << 16)) { return blas::internal::_transpose_add_impl( sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a, _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c, _batch_size, _dependencies); } else { - return blas::internal::_transpose_add_impl( + return blas::internal::_transpose_add_impl( sb_handle, _M, _N, _alpha, a_, _ld_a, _a_rows, _a_cols, _stride_a, _beta, b_, _ld_b, _b_rows, _b_cols, _stride_b, c_, _ld_c, _stride_c, _batch_size, _dependencies); From 8c5224f072e8ffb0a87fa8bf7ec392be357bedf6 Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Tue, 19 Dec 2023 17:25:01 +0000 Subject: [PATCH 2/2] Revert configuration for gemm_batched_strided (#486) This configuration performs better on NVIDIA GPUs. --- cmake/CmakeFunctionHelper.cmake | 3 +++ src/interface/blas3/backend/nvidia_gpu.hpp | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index c7804d0aa..cd84d38cd 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -592,6 +592,9 @@ elseif(${TUNING_TARGET} STREQUAL "NVIDIA_GPU") add_gemm_configuration( "${data}" 256 "false" "true" "true" 128 8 8 16 16 1 1 1 1 1 1 1 1 1 float float "local" "standard" "full" 1 "strided" "false") + add_gemm_configuration( + "${data}" 64 "false" "false" "true" + 64 8 8 8 8 1 1 2 2 1 1 1 1 1 float float "local" "standard" "full" 1 "strided" "false") endforeach() if(BLAS_ENABLE_COMPLEX) # Extract list of complex for each data in supported_types diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp index 72ef7160d..b2bca28ef 100644 --- a/src/interface/blas3/backend/nvidia_gpu.hpp +++ b/src/interface/blas3/backend/nvidia_gpu.hpp @@ -113,9 +113,9 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, if (batch_size > 1) { return blas::Gemm_Launcher< - container_0_t, container_1_t, container_2_t, 256, false, true, true, - 128, Tile<8, 8, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, float, float>, - _t_a, _t_b, s_a, s_b, static_cast(gemm_memory_t::local), + container_0_t, container_1_t, container_2_t, 64, false, false, true, + 64, Tile<8, 8, 8, 8, 1, 1, 2, 2, 1, 1, 1, 1, 1, float, float>, _t_a, + _t_b, s_a, s_b, static_cast(gemm_memory_t::local), static_cast(gemm_algorithm_t::standard), static_cast(gemm_vectorization_t::full), is_beta_zero, 1, static_cast(gemm_batch_type_t::strided),