From a4a2ec94df34d6349d43ef3897f59395ae26deb0 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 6 Mar 2024 17:41:31 +0000 Subject: [PATCH 01/12] minor fixes and reverting of ACPP changes causing unexpected tests & header only lib behaviors --- cmake/Modules/SYCL.cmake | 12 ++++++---- include/container/sycl_iterator.h | 40 +++++++------------------------ samples/CMakeLists.txt | 4 ++++ 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/cmake/Modules/SYCL.cmake b/cmake/Modules/SYCL.cmake index 31f28245e..a4efc0226 100644 --- a/cmake/Modules/SYCL.cmake +++ b/cmake/Modules/SYCL.cmake @@ -26,11 +26,13 @@ include(CheckCXXCompilerFlag) include(ConfigurePORTBLAS) # find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp) -if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) - message(STATUS "Using `omp` as ACPP_TARGETS") - set(ACPP_TARGETS "omp") -else() - message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") +if(SYCL_COMPILER MATCHES "adaptivecpp") + if(NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) + message(STATUS "Using `omp` as ACPP_TARGETS") + set(ACPP_TARGETS "omp") + else() + message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") + endif() endif() check_cxx_compiler_flag("--acpp-targets" has_acpp) diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h index 9e12939ed..e6ea4f953 100644 --- a/include/container/sycl_iterator.h +++ b/include/container/sycl_iterator.h @@ -194,32 +194,17 @@ template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh, size_t size) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); - } else { - // Skip data initialization if not accessing in read mode only - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset()), - cl::sycl::property::no_init{}); - } + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); } template template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size()); - } else { - // Skip data initialization if not accessing in read mode only - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size(), - cl::sycl::property::no_init{}); - } + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size()); } template @@ -227,18 +212,9 @@ template inline typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor(size_t size) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); - - } else { - // Skip data initialization if not accessing in read mode only - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset()), - cl::sycl::property::no_init{}); - } + return typename 
BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); } template diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index eac74afb7..d5a99be3f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,3 +1,7 @@ +cmake_minimum_required(VERSION 3.4.3) + +project(portBLASSample LANGUAGES CXX) + set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../include) set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) From 1073e1c2a9a7181ecefa345c1e3ea9ba61d72465 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 6 Mar 2024 19:23:55 +0000 Subject: [PATCH 02/12] fixes to gemm half support with default cpu --- CMakeLists.txt | 6 --- cmake/CmakeFunctionHelper.cmake | 9 +++++ src/interface/blas3/backend/default_cpu.hpp | 43 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 954f32cac..5c3462e3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,12 +113,6 @@ option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON) option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF) option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF) -if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU")) - OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET))) - set(BLAS_ENABLE_HALF OFF) - message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled") -endif() - if (SYCL_COMPILER MATCHES "adaptivecpp") if(BLAS_ENABLE_COMPLEX) message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index 2825b3a92..fe60c42f7 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -702,6 +702,15 @@ else() # default cpu backend add_gemm_configuration( "${data}" 64 "false" "false" "false" 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false") + + if(BLAS_ENABLE_HALF) + add_gemm_configuration( + "half" 128 "false" "false" "false" + 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "full" 1 "strided" "false" "false") + add_gemm_configuration( + "half" 64 "false" "false" "false" + 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false") + endif() endforeach() if(BLAS_ENABLE_COMPLEX) diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp index 33f50539f..54de53e83 100644 --- a/src/interface/blas3/backend/default_cpu.hpp +++ b/src/interface/blas3/backend/default_cpu.hpp @@ -120,6 +120,49 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, } } +// Half Configurations +template +typename std::enable_if::value, + typename sb_handle_t::event_t>::type +_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, + element_out_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, + container_1_t _b, index_t _ldb, index_t _strideb, element_out_t _beta, + container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { + // Unused configuration cases + if constexpr (s_a || s_b) { + return _dependencies; + } else { + if (batch_type == gemm_batch_type_t::interleaved) { + return 
blas::Gemm_Launcher< + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), + static_cast(gemm_algorithm_t::standard), + static_cast(gemm_vectorization_t::full), is_beta_zero, 4, + static_cast(gemm_batch_type_t::interleaved)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, + _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, + _stridec, batch_size, _dependencies); + } + + return blas::Gemm_Launcher< + container_0_t, container_1_t, container_2_t, 128, false, false, false, + 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), + static_cast(gemm_algorithm_t::standard), + static_cast(gemm_vectorization_t::full), is_beta_zero, 1, + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); + } +} + // Complex Configurations #ifdef BLAS_ENABLE_COMPLEX template Date: Thu, 7 Mar 2024 02:14:50 +0000 Subject: [PATCH 03/12] typo fix --- src/interface/blas3/backend/default_cpu.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp index 54de53e83..bf5dec3b3 100644 --- a/src/interface/blas3/backend/default_cpu.hpp +++ b/src/interface/blas3/backend/default_cpu.hpp @@ -122,14 +122,13 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, // Half Configurations template -typename std::enable_if::value, + typename sb_handle_t, typename container_0_t, typename container_1_t, + typename container_2_t, typename element_t, typename index_t> +typename std::enable_if::value, typename sb_handle_t::event_t>::type _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, - element_out_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, - container_1_t _b, index_t _ldb, index_t _strideb, element_out_t _beta, + element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, + container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, gemm_batch_type_t batch_type, const typename sb_handle_t::event_t& _dependencies) { From 22bbd8dabf5ad08576e7a763e1179762fd27d6a7 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 11 Mar 2024 13:09:09 +0000 Subject: [PATCH 04/12] added license text to samples CmakeLists --- samples/CMakeLists.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index d5a99be3f..a1392057f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,3 +1,27 @@ +#/*************************************************************************** +# * +# * @license +# * Copyright (C) Codeplay Software Limited +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * For your convenience, a copy of the License has been included in this +# * repository. +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# *
+# * portBLAS: BLAS implementation using SYCL
+# *
+# * @filename CMakeLists.txt
+# *
+# **************************************************************************/
 cmake_minimum_required(VERSION 3.4.3)
 
 project(portBLASSample LANGUAGES CXX)
 
From 4773f809d5927896a62edb73122c01da8fa87eaa Mon Sep 17 00:00:00 2001
From: nscipione
Date: Wed, 20 Mar 2024 15:00:22 +0000
Subject: [PATCH 05/12] Add check for managed usm allocation for AMD

AMD's atomic operation implementation requires specific hardware to
work properly with the current reduction kernel. This patch adds a
check, for AMD only, to provide the correct result even if the
specific hardware is not available.

Signed-off-by: nscipione
---
 include/interface/blas1_interface.h        |  6 +-
 include/operations/blas1_trees.h           | 11 ++--
 src/interface/blas1/backend/amd_gpu.hpp    | 47 ++++++++++++---
 src/interface/blas1_interface.hpp          |  9 +--
 src/operations/blas1/WGAtomicReduction.hpp | 69 +++++++++++++---------
 5 files changed, 94 insertions(+), 48 deletions(-)

diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h
index 7684a0a65..80d104e01 100644
--- a/include/interface/blas1_interface.h
+++ b/include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
 * \brief Prototype for the internal implementation of the ASUM operation. See
 * documentation in the blas1_interface.hpp file for details.
 */
-template 
+template 
typename sb_handle_t::event_t _asum_impl(
    sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const index_t number_WG,
diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
index f0d94d596..27a34b5ec 100644
--- a/include/operations/blas1_trees.h
+++ b/include/operations/blas1_trees.h
@@ -208,7 +208,7 @@ struct AssignReduction {
 * function below.
 *
 */
-template 
+template 
struct WGAtomicReduction {
  using value_t = typename lhs_t::value_t;
  using index_t = typename rhs_t::index_t;
@@ -304,10 +304,11 @@ inline AssignReduction make_assign_reduction(
      lhs_, rhs_, local_num_thread_, global_num_thread_);
}

-template 
-inline WGAtomicReduction make_wg_atomic_reduction(
-    lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction(lhs_, rhs_);
+template 
+inline WGAtomicReduction
+make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
+  return WGAtomicReduction(lhs_, rhs_);
}

template 
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 999b596df..741e7f730 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -34,16 +34,45 @@ template (localSize), 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+  /**
+   * This compile time check is absolutely necessary for AMD gpu.
+   * AMD atomic operations required a specific combination of hardware that we
+   *cannot check neither enforce to users. Since reduction operators kernel
+   *implementation useses atomic operation without that particular combination
+   *the operator may fail silently. This check enforce a different atomic
+   *address space causing a big performance degradation, but making the kernel
+   *behaves correctly also with managed memory (aka malloc_shared allocation). 
+ **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_asum_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 256; + return blas::internal::_asum_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 256; - return blas::internal::_asum_impl( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_asum_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 256; + return blas::internal::_asum_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } } } // namespace backend diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index df914b71b..7f6ee962e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -226,9 +226,9 @@ typename sb_handle_t::event_t _asum( * implementation use a kernel implementation which doesn't * require local memory. */ -template +template typename sb_handle_t::event_t _asum_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -238,7 +238,8 @@ typename sb_handle_t::event_t _asum_impl( auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); typename sb_handle_t::event_t ret; - auto asumOp = make_wg_atomic_reduction(rs, vx); + auto asumOp = + make_wg_atomic_reduction(rs, vx); if constexpr (localMemSize != 0) { ret = sb_handle.execute(asumOp, static_cast(localSize), static_cast(number_WG * localSize), diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 22d923e8d..779c33b43 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -35,26 +35,30 @@ namespace blas { * and atomics operation to combine the results. 
* * */ -template -WGAtomicReduction::WGAtomicReduction(lhs_t& _l, - rhs_t& _r) +template +WGAtomicReduction::WGAtomicReduction( + lhs_t& _l, rhs_t& _r) : lhs_(_l), rhs_(_r){}; -template -PORTBLAS_INLINE typename WGAtomicReduction::index_t -WGAtomicReduction::get_size() const { +template +PORTBLAS_INLINE + typename WGAtomicReduction::index_t + WGAtomicReduction::get_size() const { return rhs_.get_size(); } -template -PORTBLAS_INLINE bool WGAtomicReduction::valid_thread( +template +PORTBLAS_INLINE bool +WGAtomicReduction::valid_thread( cl::sycl::nd_item<1> ndItem) const { return true; } -template -PORTBLAS_INLINE typename WGAtomicReduction::value_t -WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { +template +PORTBLAS_INLINE + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( + cl::sycl::nd_item<1> ndItem) { auto atomic_res = cl::sycl::atomic_ref::eval(cl::sycl::nd_item<1> ndItem) { } return {}; } -template + +template template -PORTBLAS_INLINE typename WGAtomicReduction::value_t -WGAtomicReduction::eval(sharedT scratch, - cl::sycl::nd_item<1> ndItem) { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); +PORTBLAS_INLINE + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( + sharedT scratch, cl::sycl::nd_item<1> ndItem) { const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); const auto loop_stride = @@ -119,22 +120,36 @@ WGAtomicReduction::eval(sharedT scratch, cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - atomic_res += val; + if constexpr (!managed_mem) { + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; + } else { + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; + } } return {}; } -template -PORTBLAS_INLINE void WGAtomicReduction::bind( - cl::sycl::handler& h) { +template +PORTBLAS_INLINE void WGAtomicReduction::bind(cl::sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } -template -PORTBLAS_INLINE void -WGAtomicReduction::adjust_access_displacement() { +template +PORTBLAS_INLINE void WGAtomicReduction::adjust_access_displacement() { lhs_.adjust_access_displacement(); rhs_.adjust_access_displacement(); } From b859f43b00d1cb9e8f27ee0b03827cbfc5ab1d82 Mon Sep 17 00:00:00 2001 From: nscipione Date: Wed, 20 Mar 2024 21:30:23 +0000 Subject: [PATCH 06/12] Add usm memory fix to other reduction operators Signed-off-by: nscipione --- include/interface/blas1_interface.h | 10 +-- src/interface/blas1/backend/amd_gpu.hpp | 82 +++++++++++++++++++------ src/interface/blas1_interface.hpp | 16 ++--- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index 80d104e01..f344e55be 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -257,9 +257,9 @@ typename sb_handle_t::event_t _nrm2( * \brief Prototype for the internal implementation of the NRM2 operator. See * documentation in the blas1_interface.hpp file for details. */ -template +template typename sb_handle_t::event_t _nrm2_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -269,8 +269,8 @@ typename sb_handle_t::event_t _nrm2_impl( * \brief Prototype for the internal implementation of the Dot operator. See * documentation in the blas1_interface.hpp file for details. 
*/ -template typename sb_handle_t::event_t _dot_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 741e7f730..7ec252995 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -130,16 +130,39 @@ template (localSize), 32>( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + /** + * Read comment in _asum above. + **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_nrm2_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_nrm2_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 512; - return blas::internal::_nrm2_impl( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_nrm2_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_nrm2_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } } } // namespace backend @@ -153,16 +176,39 @@ typename sb_handle_t::event_t _dot( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t& _dependencies) { - if (_N < (1 << 18)) { - constexpr index_t localSize = 1024; - const index_t number_WG = (_N + localSize - 1) / localSize; - return blas::internal::_dot_impl(localSize), 32>( - sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + /** + * Read comment in _asum above. 
+ **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 512; - return blas::internal::_dot_impl( - sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } } } } // namespace backend diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 7f6ee962e..bc458d42e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -548,9 +548,9 @@ typename sb_handle_t::event_t _nrm2( * implementation use a kernel implementation which doesn't * require local memory. */ -template +template typename sb_handle_t::event_t _nrm2_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -561,7 +561,8 @@ typename sb_handle_t::event_t _nrm2_impl( static_cast(1)); auto prdOp = make_op(vx); - auto assignOp = make_wg_atomic_reduction(rs, prdOp); + auto assignOp = + make_wg_atomic_reduction(rs, prdOp); typename sb_handle_t::event_t ret0; if constexpr (localMemSize != 0) { ret0 = sb_handle.execute(assignOp, static_cast(localSize), @@ -596,8 +597,8 @@ typename sb_handle_t::event_t _nrm2_impl( * implementation use a kernel implementation which doesn't * require local memory. */ -template typename sb_handle_t::event_t _dot_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, @@ -613,7 +614,8 @@ typename sb_handle_t::event_t _dot_impl( static_cast(1)); auto prdOp = make_op(vx, vy); - auto wgReductionOp = make_wg_atomic_reduction(rs, prdOp); + auto wgReductionOp = + make_wg_atomic_reduction(rs, prdOp); if constexpr (localMemSize) { ret_event = From 030a5add0decbf8f3f7d604398b6aa84d7b54cf4 Mon Sep 17 00:00:00 2001 From: nscipione Date: Thu, 21 Mar 2024 09:21:55 +0000 Subject: [PATCH 07/12] Renaming template parameter and variable, avoid memory check if using buffer only Signed-off-by: nscipione --- include/operations/blas1_trees.h | 9 +++-- src/interface/blas1/backend/amd_gpu.hpp | 30 ++++++++++----- src/operations/blas1/WGAtomicReduction.hpp | 44 +++++++++++++--------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h index 27a34b5ec..d005e1915 100644 --- a/include/operations/blas1_trees.h +++ b/include/operations/blas1_trees.h @@ -208,7 +208,8 @@ struct AssignReduction { * function below. 
* */ -template +template struct WGAtomicReduction { using value_t = typename lhs_t::value_t; using index_t = typename rhs_t::index_t; @@ -304,11 +305,11 @@ inline AssignReduction make_assign_reduction( lhs_, rhs_, local_num_thread_, global_num_thread_); } -template -inline WGAtomicReduction +inline WGAtomicReduction make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) { - return WGAtomicReduction(lhs_, rhs_); + return WGAtomicReduction(lhs_, rhs_); } template diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 7ec252995..e6d8e44ed 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -43,13 +43,17 @@ typename sb_handle_t::event_t _asum( *address space causing a big performance degradation, but making the kernel *behaves correctly also with managed memory (aka malloc_shared allocation). **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; @@ -133,13 +137,17 @@ typename sb_handle_t::event_t _nrm2( /** * Read comment in _asum above. **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; @@ -179,13 +187,17 @@ typename sb_handle_t::event_t _dot( /** * Read comment in _asum above. **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 779c33b43..21756c7c0 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -35,29 +35,34 @@ namespace blas { * and atomics operation to combine the results. 
* * */ -template -WGAtomicReduction::WGAtomicReduction( +template +WGAtomicReduction::WGAtomicReduction( lhs_t& _l, rhs_t& _r) : lhs_(_l), rhs_(_r){}; -template +template PORTBLAS_INLINE - typename WGAtomicReduction::index_t - WGAtomicReduction::get_size() const { + typename WGAtomicReduction::index_t + WGAtomicReduction::get_size() + const { return rhs_.get_size(); } -template +template PORTBLAS_INLINE bool -WGAtomicReduction::valid_thread( +WGAtomicReduction::valid_thread( cl::sycl::nd_item<1> ndItem) const { return true; } -template +template PORTBLAS_INLINE - typename WGAtomicReduction::value_t - WGAtomicReduction::eval( + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( cl::sycl::nd_item<1> ndItem) { auto atomic_res = cl::sycl::atomic_ref +template template PORTBLAS_INLINE - typename WGAtomicReduction::value_t - WGAtomicReduction::eval( + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( sharedT scratch, cl::sycl::nd_item<1> ndItem) { const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); @@ -120,7 +126,7 @@ PORTBLAS_INLINE cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - if constexpr (!managed_mem) { + if constexpr (!usmManagedMem) { auto atomic_res = cl::sycl::atomic_ref -PORTBLAS_INLINE void WGAtomicReduction +PORTBLAS_INLINE void WGAtomicReduction::bind(cl::sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } -template -PORTBLAS_INLINE void WGAtomicReduction +PORTBLAS_INLINE void WGAtomicReduction::adjust_access_displacement() { lhs_.adjust_access_displacement(); rhs_.adjust_access_displacement(); From eeec76116bf41a18d459b48ed96751ff8fa72856 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 25 Mar 2024 14:52:05 +0000 Subject: [PATCH 08/12] Added clarifications regarding half support --- README.md | 4 ++-- src/interface/blas3/backend/default_cpu.hpp | 3 ++- src/interface/blas3/backend/intel_gpu.hpp | 3 ++- src/interface/blas3/backend/nvidia_gpu.hpp | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7b1ee2b70..3752e5bf7 100644 --- a/README.md +++ b/README.md @@ -491,10 +491,10 @@ Some of the supported options are: | `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default | | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) | | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) | -| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | -| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` | +| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float`. Enabling other types such as complex or half requires setting their respective options *(next)*. | | `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) | | `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) | +| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. 
Default is `int` |
 
 ## ComputeCpp Compilation *(Deprecated)*
 
diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp
index bf5dec3b3..dbe475d1b 100644
--- a/src/interface/blas3/backend/default_cpu.hpp
+++ b/src/interface/blas3/backend/default_cpu.hpp
@@ -132,7 +132,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp
index bf56e684e..8cc8180ef 100644
--- a/src/interface/blas3/backend/intel_gpu.hpp
+++ b/src/interface/blas3/backend/intel_gpu.hpp
@@ -222,7 +222,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
     return _dependencies;
   } else {
diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp
index a60933694..a6f8dc419 100644
--- a/src/interface/blas3/backend/nvidia_gpu.hpp
+++ b/src/interface/blas3/backend/nvidia_gpu.hpp
@@ -188,7 +188,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
     return _dependencies;
   } else {

From 0201a502ba92b79a6d061a0c3b3a545c84396e8b Mon Sep 17 00:00:00 2001
From: nscipione
Date: Mon, 25 Mar 2024 16:16:01 +0000
Subject: [PATCH 09/12] Add documentation for new template parameter

Add documentation for `usmManagedMem` template parameter.

Signed-off-by: nscipione
---
 src/interface/blas1_interface.hpp | 60 ++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index bc458d42e..7c8180aa5 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -219,12 +219,26 @@ typename sb_handle_t::event_t _asum(
 * the platform being compiled for and other parameters, provides different
 * template parameters to ensure the most optimal kernel is constructed.
 *
- * @tparam localSize specifies the number of threads per work group used by
- * the kernel
- * @tparam localMemSize specifies the size of local shared memory to use, which
+ * @tparam localSize Specifies the number of threads per work group used by
+ * the kernel
+ * @tparam localMemSize Specifies the size of local shared memory to use, which
 * is device and implementation dependent. 
If 0 the * implementation use a kernel implementation which doesn't * require local memory. + * @tparam usmManagedMem Specifies if usm memory allocation is automatically + * managed or not. The memory automatically managed + * requires that atomic address space is set to generic. + * This is a strict requirement only for AMD gpus, since + * otherwise it will rely on pcie atomics + * which we cannot enforce, guarantee or check due to its + * hardware nature. Other targets do not have the same + * strong dependency and managed memory is handled + * correctly in any case by default. It is automatically + * initialized to false to reduce verbosity of + * initialization for many targets since only one of them, + * with specific allocation type, requires a different + * value. Having a default value allows the compiler to + * handle automatically other templates. */ template Date: Tue, 26 Mar 2024 17:42:55 +0100 Subject: [PATCH 10/12] Apply suggestions from code review Co-authored-by: HJA Bird Co-authored-by: pgorlani <92453485+pgorlani@users.noreply.github.com> --- src/interface/blas1/backend/amd_gpu.hpp | 14 ++++++------- src/operations/blas1/WGAtomicReduction.hpp | 23 ++++++++-------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index e6d8e44ed..3a1409b8d 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -35,13 +35,13 @@ typename sb_handle_t::event_t _asum( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) { /** - * This compile time check is absolutely necessary for AMD gpu. - * AMD atomic operations required a specific combination of hardware that we - *cannot check neither enforce to users. Since reduction operators kernel - *implementation useses atomic operation without that particular combination - *the operator may fail silently. This check enforce a different atomic - *address space causing a big performance degradation, but making the kernel - *behaves correctly also with managed memory (aka malloc_shared allocation). + * This compile time check is absolutely necessary for AMD GPUs. + * AMD's atomic operations require a specific combination of hardware that + * cannot be checked nor enforced. Since the reduction operator kernel + * implementation uses atomic operations, without that particular hardware combination + * the reduction may silently fail. This check enforces a different atomic + * address space causing a big performance degradation, but also making the kernel + * behave correctly with managed memory (aka malloc_shared allocation). **/ #ifdef SB_ENABLE_USM bool usm_managed_mem{false}; diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 21756c7c0..ca46b8269 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -126,21 +126,14 @@ PORTBLAS_INLINE cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - if constexpr (!usmManagedMem) { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); - atomic_res += val; - } else { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); - atomic_res += val; - } + constexpr cl::sycl::access::address_space addr_sp = + usmManagedMem ? 
cl::sycl::access::address_space::generic_space + : cl::sycl::access::address_space::global_space; + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; } return {}; From 3568b4858dfb87d2cabc82eed9555991358f350b Mon Sep 17 00:00:00 2001 From: nscipione Date: Thu, 28 Mar 2024 14:05:11 +0000 Subject: [PATCH 11/12] Moving memory alloc type into helper function and addressing other PR comments Addressing PR comments on memory allocation type function checker and updating comment. Fixing documentation. Signed-off-by: nscipione --- include/portblas_helper.h | 10 +++++ src/interface/blas1/backend/amd_gpu.hpp | 40 ++++++++----------- src/interface/blas1_interface.hpp | 52 +++++++++---------------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/include/portblas_helper.h b/include/portblas_helper.h index 4ef250829..a07ba745a 100644 --- a/include/portblas_helper.h +++ b/include/portblas_helper.h @@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value, } #endif +template +inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) { + if constexpr (std::is_pointer_v) { + return sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } else { + return false; + } +} + } // end namespace helper } // end namespace blas #endif // PORTBLAS_HELPER_H diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 3a1409b8d..f61bc6e8b 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -25,6 +25,7 @@ #ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP #define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP #include "interface/blas1_interface.h" +#include "portblas_helper.h" namespace blas { namespace asum { @@ -38,18 +39,11 @@ typename sb_handle_t::event_t _asum( * This compile time check is absolutely necessary for AMD GPUs. * AMD's atomic operations require a specific combination of hardware that * cannot be checked nor enforced. Since the reduction operator kernel - * implementation uses atomic operations, without that particular hardware combination - * the reduction may silently fail. This check enforces a different atomic - * address space causing a big performance degradation, but also making the kernel - * behave correctly with managed memory (aka malloc_shared allocation). + * implementation uses atomic operations, without that particular hardware + * combination the reduction may silently fail. **/ #ifdef SB_ENABLE_USM - bool usm_managed_mem{false}; - if constexpr (std::is_pointer_v) { - usm_managed_mem = - sycl::usm::alloc::shared == - sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); - } + const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs); #else constexpr bool usm_managed_mem{false}; #endif @@ -135,15 +129,14 @@ typename sb_handle_t::event_t _nrm2( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) { /** - * Read comment in _asum above. + * This compile time check is absolutely necessary for AMD GPUs. + * AMD's atomic operations require a specific combination of hardware that + * cannot be checked nor enforced. Since the reduction operator kernel + * implementation uses atomic operations, without that particular hardware + * combination the reduction may silently fail. 
**/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
@@ -185,15 +178,14 @@ typename sb_handle_t::event_t _dot(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _vy, increment_t _incy, container_2_t _rs,
     const typename sb_handle_t::event_t& _dependencies) {
   /**
-   * Read comment in _asum above.
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
   **/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index 7c8180aa5..e92027823 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -226,19 +226,14 @@ typename sb_handle_t::event_t _asum(
 * implementation use a kernel implementation which doesn't
 * require local memory.
 * @tparam usmManagedMem Specifies if usm memory allocation is automatically
- *                      managed or not. The memory automatically managed
+ *                      managed or not. Automatically managed memory
 *                       requires that atomic address space is set to generic.
- *                       This is a strict requirement only for AMD gpus, since
- *                       otherwise it will rely on pcie atomics
- *                       which we cannot enforce, guarantee or check due to its
- *                       hardware nature. Other targets do not have the same
+ *                       This is a strict requirement only for AMD GPUs, since
+ *                       AMD's implementation of atomics may depend on specific
+ *                       hardware configurations (PCIe atomics) that cannot be
+ *                       checked at runtime. Other targets do not have the same
 *                       strong dependency and managed memory is handled
- *                       correctly in any case by default. It is automatically
- *                       initialized to false to reduce verbosity of
- *                       initialization for many targets since only one of them,
- *                       with specific allocation type, requires a different
- *                       value. Having a default value allows the compiler to
- *                       handle automatically other templates.
+ *                       correctly by default.

From 3568b4858dfb87d2cabc82eed9555991358f350b Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI
Date: Wed, 3 Apr 2024 16:40:29 +0100
Subject: [PATCH 12/12] Addressed review comments

---
 src/interface/blas3/backend/intel_gpu.hpp  | 2 +-
 src/interface/blas3/backend/nvidia_gpu.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp
index 8cc8180ef..1fc837dd3 100644
--- a/src/interface/blas3/backend/intel_gpu.hpp
+++ b/src/interface/blas3/backend/intel_gpu.hpp
@@ -224,7 +224,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
   // The symmetric matrix cases are not enabled with half (fp16) as the Symm
   // operator's specification doesn't include half floating point data. 
-  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+  if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
     if (batch_type == gemm_batch_type_t::interleaved) {
diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp
index a6f8dc419..00b036d4f 100644
--- a/src/interface/blas3/backend/nvidia_gpu.hpp
+++ b/src/interface/blas3/backend/nvidia_gpu.hpp
@@ -190,7 +190,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     const typename sb_handle_t::event_t& _dependencies) {
   // The symmetric matrix cases are not enabled with half (fp16) as the Symm
   // operator's specification doesn't include half floating point data.
-  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+  if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
     if (batch_type == gemm_batch_type_t::interleaved) {
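Note on the reduction change in patches 05 through 11: the fix boils down to
two pieces, a compile-time choice of atomic address space and a runtime check
of the output allocation kind. The sketch below condenses both into a
standalone SYCL 2020 fragment. It is illustrative only: `accumulate` and
`launch_reduction_tail` are made-up names, not portBLAS symbols, and the code
assumes a SYCL 2020 compiler with USM support.

    #include <sycl/sycl.hpp>

    // Device-side tail of a work-group reduction: the group leader adds its
    // partial value into *out. The address space is a template parameter
    // because, on AMD GPUs lacking the required hardware support,
    // global_space atomics on malloc_shared memory may silently produce
    // wrong results, while generic_space atomics stay correct (at a
    // performance cost).
    template <bool UsmManagedMem, typename T>
    void accumulate(T* out, T partial) {
      constexpr auto space = UsmManagedMem
                                 ? sycl::access::address_space::generic_space
                                 : sycl::access::address_space::global_space;
      sycl::atomic_ref<T, sycl::memory_order::relaxed,
                       sycl::memory_scope::device, space>
          result(*out);
      result += partial;
    }

    // Host-side dispatch, mirroring the role of blas::helper::is_malloc_shared
    // from patch 11: query the allocation kind of the result pointer and pick
    // the matching kernel instantiation.
    template <typename T>
    void launch_reduction_tail(sycl::queue& q, T* out, T partial) {
      const bool managed = sycl::usm::alloc::shared ==
                           sycl::get_pointer_type(out, q.get_context());
      if (managed) {
        q.single_task([=] { accumulate<true>(out, partial); }).wait();
      } else {
        q.single_task([=] { accumulate<false>(out, partial); }).wait();
      }
    }

On non-AMD backends both instantiations behave identically, which is why the
series defaults the new template parameter to false and only the AMD backend
queries the allocation kind of the result memory.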