From a4a2ec94df34d6349d43ef3897f59395ae26deb0 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 6 Mar 2024 17:41:31 +0000 Subject: [PATCH 01/12] minor fixes and reverting of ACPP changes causing unexpected tests & header only lib behaviors --- cmake/Modules/SYCL.cmake | 12 ++++++---- include/container/sycl_iterator.h | 40 +++++++------------------------ samples/CMakeLists.txt | 4 ++++ 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/cmake/Modules/SYCL.cmake b/cmake/Modules/SYCL.cmake index 31f28245e..a4efc0226 100644 --- a/cmake/Modules/SYCL.cmake +++ b/cmake/Modules/SYCL.cmake @@ -26,11 +26,13 @@ include(CheckCXXCompilerFlag) include(ConfigurePORTBLAS) # find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp) -if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) - message(STATUS "Using `omp` as ACPP_TARGETS") - set(ACPP_TARGETS "omp") -else() - message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") +if(SYCL_COMPILER MATCHES "adaptivecpp") + if(NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS}) + message(STATUS "Using `omp` as ACPP_TARGETS") + set(ACPP_TARGETS "omp") + else() + message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS") + endif() endif() check_cxx_compiler_flag("--acpp-targets" has_acpp) diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h index 9e12939ed..e6ea4f953 100644 --- a/include/container/sycl_iterator.h +++ b/include/container/sycl_iterator.h @@ -194,32 +194,17 @@ template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh, size_t size) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); - } else { - // Skip data initialization if not accessing in read mode only - return typename BufferIterator::template accessor_t( - buffer_, cgh, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset()), - cl::sycl::property::no_init{}); - } + return typename BufferIterator::template accessor_t( + buffer_, cgh, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); } template template inline typename BufferIterator::template accessor_t BufferIterator::get_range_accessor(cl::sycl::handler& cgh) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size()); - } else { - // Skip data initialization if not accessing in read mode only - return BufferIterator::get_range_accessor( - cgh, BufferIterator::get_size(), - cl::sycl::property::no_init{}); - } + return BufferIterator::get_range_accessor( + cgh, BufferIterator::get_size()); } template @@ -227,18 +212,9 @@ template inline typename BufferIterator::template placeholder_accessor_t< acc_md_t> BufferIterator::get_range_accessor(size_t size) { - if constexpr (acc_md_t == cl::sycl::access::mode::read) { - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset())); - - } else { - // Skip data initialization if not accessing in read mode only - return typename BufferIterator::template placeholder_accessor_t< - acc_md_t>(buffer_, cl::sycl::range<1>(size), - cl::sycl::id<1>(BufferIterator::get_offset()), - cl::sycl::property::no_init{}); - } + return typename 
BufferIterator::template placeholder_accessor_t< + acc_md_t>(buffer_, cl::sycl::range<1>(size), + cl::sycl::id<1>(BufferIterator::get_offset())); } template diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index eac74afb7..d5a99be3f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,3 +1,7 @@ +cmake_minimum_required(VERSION 3.4.3) + +project(portBLASSample LANGUAGES CXX) + set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../include) set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) From 1073e1c2a9a7181ecefa345c1e3ea9ba61d72465 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Wed, 6 Mar 2024 19:23:55 +0000 Subject: [PATCH 02/12] fixes to gemm half support with default cpu --- CMakeLists.txt | 6 --- cmake/CmakeFunctionHelper.cmake | 9 +++++ src/interface/blas3/backend/default_cpu.hpp | 43 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 954f32cac..5c3462e3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,12 +113,6 @@ option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON) option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF) option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF) -if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU")) - OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET))) - set(BLAS_ENABLE_HALF OFF) - message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled") -endif() - if (SYCL_COMPILER MATCHES "adaptivecpp") if(BLAS_ENABLE_COMPLEX) message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index 2825b3a92..fe60c42f7 100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -702,6 +702,15 @@ else() # default cpu backend add_gemm_configuration( "${data}" 64 "false" "false" "false" 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false") + + if(BLAS_ENABLE_HALF) + add_gemm_configuration( + "half" 128 "false" "false" "false" + 64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "full" 1 "strided" "false" "false") + add_gemm_configuration( + "half" 64 "false" "false" "false" + 64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false") + endif() endforeach() if(BLAS_ENABLE_COMPLEX) diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp index 33f50539f..54de53e83 100644 --- a/src/interface/blas3/backend/default_cpu.hpp +++ b/src/interface/blas3/backend/default_cpu.hpp @@ -120,6 +120,49 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, } } +// Half Configurations +template +typename std::enable_if::value, + typename sb_handle_t::event_t>::type +_gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, + element_out_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, + container_1_t _b, index_t _ldb, index_t _strideb, element_out_t _beta, + container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, + gemm_batch_type_t batch_type, + const typename sb_handle_t::event_t& _dependencies) { + // Unused configuration cases + if constexpr (s_a || s_b) { + return _dependencies; + } else { + if (batch_type == gemm_batch_type_t::interleaved) { + return 
blas::Gemm_Launcher< + container_0_t, container_1_t, container_2_t, 64, false, false, false, + 64, Tile<2, 2, 4, 4, 1, 1, 1, 1, 4, 4>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), + static_cast(gemm_algorithm_t::standard), + static_cast(gemm_vectorization_t::full), is_beta_zero, 4, + static_cast(gemm_batch_type_t::interleaved)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, + _stridea, _b, _ldb, _strideb, _beta, _c, _ldc, + _stridec, batch_size, _dependencies); + } + + return blas::Gemm_Launcher< + container_0_t, container_1_t, container_2_t, 128, false, false, false, + 64, Tile<4, 4, 8, 8>, _t_a, _t_b, s_a, s_b, + static_cast(gemm_memory_t::no_local), + static_cast(gemm_algorithm_t::standard), + static_cast(gemm_vectorization_t::full), is_beta_zero, 1, + static_cast(gemm_batch_type_t::strided)>:: + template _select_gemm(sb_handle, _M, _N, _K, _alpha, _a, _lda, _stridea, + _b, _ldb, _strideb, _beta, _c, _ldc, _stridec, + batch_size, _dependencies); + } +} + // Complex Configurations #ifdef BLAS_ENABLE_COMPLEX template Date: Thu, 7 Mar 2024 02:14:50 +0000 Subject: [PATCH 03/12] typo fix --- src/interface/blas3/backend/default_cpu.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp index 54de53e83..bf5dec3b3 100644 --- a/src/interface/blas3/backend/default_cpu.hpp +++ b/src/interface/blas3/backend/default_cpu.hpp @@ -122,14 +122,13 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, // Half Configurations template -typename std::enable_if::value, + typename sb_handle_t, typename container_0_t, typename container_1_t, + typename container_2_t, typename element_t, typename index_t> +typename std::enable_if::value, typename sb_handle_t::event_t>::type _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K, - element_out_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, - container_1_t _b, index_t _ldb, index_t _strideb, element_out_t _beta, + element_t _alpha, container_0_t _a, index_t _lda, index_t _stridea, + container_1_t _b, index_t _ldb, index_t _strideb, element_t _beta, container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size, gemm_batch_type_t batch_type, const typename sb_handle_t::event_t& _dependencies) { From 22bbd8dabf5ad08576e7a763e1179762fd27d6a7 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 11 Mar 2024 13:09:09 +0000 Subject: [PATCH 04/12] added license text to samples CmakeLists --- samples/CMakeLists.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index d5a99be3f..a1392057f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,3 +1,27 @@ +#/*************************************************************************** +# * +# * @license +# * Copyright (C) Codeplay Software Limited +# * Licensed under the Apache License, Version 2.0 (the "License"); +# * you may not use this file except in compliance with the License. +# * You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * For your convenience, a copy of the License has been included in this +# * repository. +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# *
+# * portBLAS: BLAS implementation using SYCL
+# *
+# * @filename CMakeLists.txt
+# *
+# **************************************************************************/
 cmake_minimum_required(VERSION 3.4.3)
 
 project(portBLASSample LANGUAGES CXX)
 
From 4773f809d5927896a62edb73122c01da8fa87eaa Mon Sep 17 00:00:00 2001
From: nscipione
Date: Wed, 20 Mar 2024 15:00:22 +0000
Subject: [PATCH 05/12] Add check for managed usm allocation for AMD

AMD's atomic operation implementation requires specific hardware to
work properly with the current reduction kernel. This patch adds a
check, for AMD only, to provide the correct result even if the
specific hardware is not available.

Signed-off-by: nscipione
---
 include/interface/blas1_interface.h        |  6 +-
 include/operations/blas1_trees.h           | 11 ++--
 src/interface/blas1/backend/amd_gpu.hpp    | 47 ++++++++++++---
 src/interface/blas1_interface.hpp          |  9 +--
 src/operations/blas1/WGAtomicReduction.hpp | 69 +++++++++++++---------
 5 files changed, 94 insertions(+), 48 deletions(-)

diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h
index 7684a0a65..80d104e01 100644
--- a/include/interface/blas1_interface.h
+++ b/include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
 * \brief Prototype for the internal implementation of the ASUM operation. See
 * documentation in the blas1_interface.hpp file for details.
 */
-template 
+template 
typename sb_handle_t::event_t _asum_impl(
    sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const index_t number_WG,
diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
index f0d94d596..27a34b5ec 100644
--- a/include/operations/blas1_trees.h
+++ b/include/operations/blas1_trees.h
@@ -208,7 +208,7 @@ struct AssignReduction {
 * function below.
 *
 */
-template 
+template 
struct WGAtomicReduction {
  using value_t = typename lhs_t::value_t;
  using index_t = typename rhs_t::index_t;
@@ -304,10 +304,11 @@ inline AssignReduction make_assign_reduction(
      lhs_, rhs_, local_num_thread_, global_num_thread_);
}

-template 
-inline WGAtomicReduction make_wg_atomic_reduction(
-    lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction(lhs_, rhs_);
+template 
+inline WGAtomicReduction
+make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
+  return WGAtomicReduction(lhs_, rhs_);
}

template 
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 999b596df..741e7f730 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -34,16 +34,45 @@ template (localSize), 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+  /**
+   * This compile time check is absolutely necessary for AMD gpu.
+   * AMD atomic operations required a specific combination of hardware that we
+   *cannot check neither enforce to users. Since reduction operators kernel
+   *implementation useses atomic operation without that particular combination
+   *the operator may fail silently. This check enforce a different atomic
+   *address space causing a big performance degradation, but making the kernel
+   *behaves correctly also with managed memory (aka malloc_shared allocation). 
+ **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_asum_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 256; + return blas::internal::_asum_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 256; - return blas::internal::_asum_impl( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_asum_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 256; + return blas::internal::_asum_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } } } // namespace backend diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index df914b71b..7f6ee962e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -226,9 +226,9 @@ typename sb_handle_t::event_t _asum( * implementation use a kernel implementation which doesn't * require local memory. */ -template +template typename sb_handle_t::event_t _asum_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -238,7 +238,8 @@ typename sb_handle_t::event_t _asum_impl( auto rs = make_vector_view(_rs, static_cast(1), static_cast(1)); typename sb_handle_t::event_t ret; - auto asumOp = make_wg_atomic_reduction(rs, vx); + auto asumOp = + make_wg_atomic_reduction(rs, vx); if constexpr (localMemSize != 0) { ret = sb_handle.execute(asumOp, static_cast(localSize), static_cast(number_WG * localSize), diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 22d923e8d..779c33b43 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -35,26 +35,30 @@ namespace blas { * and atomics operation to combine the results. 
* * */ -template -WGAtomicReduction::WGAtomicReduction(lhs_t& _l, - rhs_t& _r) +template +WGAtomicReduction::WGAtomicReduction( + lhs_t& _l, rhs_t& _r) : lhs_(_l), rhs_(_r){}; -template -PORTBLAS_INLINE typename WGAtomicReduction::index_t -WGAtomicReduction::get_size() const { +template +PORTBLAS_INLINE + typename WGAtomicReduction::index_t + WGAtomicReduction::get_size() const { return rhs_.get_size(); } -template -PORTBLAS_INLINE bool WGAtomicReduction::valid_thread( +template +PORTBLAS_INLINE bool +WGAtomicReduction::valid_thread( cl::sycl::nd_item<1> ndItem) const { return true; } -template -PORTBLAS_INLINE typename WGAtomicReduction::value_t -WGAtomicReduction::eval(cl::sycl::nd_item<1> ndItem) { +template +PORTBLAS_INLINE + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( + cl::sycl::nd_item<1> ndItem) { auto atomic_res = cl::sycl::atomic_ref::eval(cl::sycl::nd_item<1> ndItem) { } return {}; } -template + +template template -PORTBLAS_INLINE typename WGAtomicReduction::value_t -WGAtomicReduction::eval(sharedT scratch, - cl::sycl::nd_item<1> ndItem) { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); +PORTBLAS_INLINE + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( + sharedT scratch, cl::sycl::nd_item<1> ndItem) { const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); const auto loop_stride = @@ -119,22 +120,36 @@ WGAtomicReduction::eval(sharedT scratch, cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - atomic_res += val; + if constexpr (!managed_mem) { + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; + } else { + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; + } } return {}; } -template -PORTBLAS_INLINE void WGAtomicReduction::bind( - cl::sycl::handler& h) { +template +PORTBLAS_INLINE void WGAtomicReduction::bind(cl::sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } -template -PORTBLAS_INLINE void -WGAtomicReduction::adjust_access_displacement() { +template +PORTBLAS_INLINE void WGAtomicReduction::adjust_access_displacement() { lhs_.adjust_access_displacement(); rhs_.adjust_access_displacement(); } From b859f43b00d1cb9e8f27ee0b03827cbfc5ab1d82 Mon Sep 17 00:00:00 2001 From: nscipione Date: Wed, 20 Mar 2024 21:30:23 +0000 Subject: [PATCH 06/12] Add usm memory fix to other reduction operators Signed-off-by: nscipione --- include/interface/blas1_interface.h | 10 +-- src/interface/blas1/backend/amd_gpu.hpp | 82 +++++++++++++++++++------ src/interface/blas1_interface.hpp | 16 ++--- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h index 80d104e01..f344e55be 100644 --- a/include/interface/blas1_interface.h +++ b/include/interface/blas1_interface.h @@ -257,9 +257,9 @@ typename sb_handle_t::event_t _nrm2( * \brief Prototype for the internal implementation of the NRM2 operator. See * documentation in the blas1_interface.hpp file for details. */ -template +template typename sb_handle_t::event_t _nrm2_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -269,8 +269,8 @@ typename sb_handle_t::event_t _nrm2_impl( * \brief Prototype for the internal implementation of the Dot operator. See * documentation in the blas1_interface.hpp file for details. 
*/ -template typename sb_handle_t::event_t _dot_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 741e7f730..7ec252995 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -130,16 +130,39 @@ template (localSize), 32>( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + /** + * Read comment in _asum above. + **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_nrm2_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_nrm2_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 512; - return blas::internal::_nrm2_impl( - sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_nrm2_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_nrm2_impl( + sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies); + } } } } // namespace backend @@ -153,16 +176,39 @@ typename sb_handle_t::event_t _dot( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _vy, increment_t _incy, container_2_t _rs, const typename sb_handle_t::event_t& _dependencies) { - if (_N < (1 << 18)) { - constexpr index_t localSize = 1024; - const index_t number_WG = (_N + localSize - 1) / localSize; - return blas::internal::_dot_impl(localSize), 32>( - sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + /** + * Read comment in _asum above. 
+ **/ + bool managed_mem{false}; + if constexpr (std::is_pointer_v) { + managed_mem = + sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } + if (managed_mem) { + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32, true>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } } else { - constexpr int localSize = 512; - constexpr index_t number_WG = 512; - return blas::internal::_dot_impl( - sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + if (_N < (1 << 18)) { + constexpr index_t localSize = 1024; + const index_t number_WG = (_N + localSize - 1) / localSize; + return blas::internal::_dot_impl(localSize), 32, false>( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } else { + constexpr int localSize = 512; + constexpr index_t number_WG = 512; + return blas::internal::_dot_impl( + sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies); + } } } } // namespace backend diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp index 7f6ee962e..bc458d42e 100644 --- a/src/interface/blas1_interface.hpp +++ b/src/interface/blas1_interface.hpp @@ -548,9 +548,9 @@ typename sb_handle_t::event_t _nrm2( * implementation use a kernel implementation which doesn't * require local memory. */ -template +template typename sb_handle_t::event_t _nrm2_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const index_t number_WG, @@ -561,7 +561,8 @@ typename sb_handle_t::event_t _nrm2_impl( static_cast(1)); auto prdOp = make_op(vx); - auto assignOp = make_wg_atomic_reduction(rs, prdOp); + auto assignOp = + make_wg_atomic_reduction(rs, prdOp); typename sb_handle_t::event_t ret0; if constexpr (localMemSize != 0) { ret0 = sb_handle.execute(assignOp, static_cast(localSize), @@ -596,8 +597,8 @@ typename sb_handle_t::event_t _nrm2_impl( * implementation use a kernel implementation which doesn't * require local memory. */ -template typename sb_handle_t::event_t _dot_impl( sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx, @@ -613,7 +614,8 @@ typename sb_handle_t::event_t _dot_impl( static_cast(1)); auto prdOp = make_op(vx, vy); - auto wgReductionOp = make_wg_atomic_reduction(rs, prdOp); + auto wgReductionOp = + make_wg_atomic_reduction(rs, prdOp); if constexpr (localMemSize) { ret_event = From 030a5add0decbf8f3f7d604398b6aa84d7b54cf4 Mon Sep 17 00:00:00 2001 From: nscipione Date: Thu, 21 Mar 2024 09:21:55 +0000 Subject: [PATCH 07/12] Renaming template parameter and variable, avoid memory check if using buffer only Signed-off-by: nscipione --- include/operations/blas1_trees.h | 9 +++-- src/interface/blas1/backend/amd_gpu.hpp | 30 ++++++++++----- src/operations/blas1/WGAtomicReduction.hpp | 44 +++++++++++++--------- 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h index 27a34b5ec..d005e1915 100644 --- a/include/operations/blas1_trees.h +++ b/include/operations/blas1_trees.h @@ -208,7 +208,8 @@ struct AssignReduction { * function below. 
* */ -template +template struct WGAtomicReduction { using value_t = typename lhs_t::value_t; using index_t = typename rhs_t::index_t; @@ -304,11 +305,11 @@ inline AssignReduction make_assign_reduction( lhs_, rhs_, local_num_thread_, global_num_thread_); } -template -inline WGAtomicReduction +inline WGAtomicReduction make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) { - return WGAtomicReduction(lhs_, rhs_); + return WGAtomicReduction(lhs_, rhs_); } template diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 7ec252995..e6d8e44ed 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -43,13 +43,17 @@ typename sb_handle_t::event_t _asum( *address space causing a big performance degradation, but making the kernel *behaves correctly also with managed memory (aka malloc_shared allocation). **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; @@ -133,13 +137,17 @@ typename sb_handle_t::event_t _nrm2( /** * Read comment in _asum above. **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; @@ -179,13 +187,17 @@ typename sb_handle_t::event_t _dot( /** * Read comment in _asum above. **/ - bool managed_mem{false}; +#ifdef SB_ENABLE_USM + bool usm_managed_mem{false}; if constexpr (std::is_pointer_v) { - managed_mem = + usm_managed_mem = sycl::usm::alloc::shared == sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); } - if (managed_mem) { +#else + constexpr bool usm_managed_mem{false}; +#endif + if (usm_managed_mem) { if (_N < (1 << 18)) { constexpr index_t localSize = 1024; const index_t number_WG = (_N + localSize - 1) / localSize; diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 779c33b43..21756c7c0 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -35,29 +35,34 @@ namespace blas { * and atomics operation to combine the results. 
* * */ -template -WGAtomicReduction::WGAtomicReduction( +template +WGAtomicReduction::WGAtomicReduction( lhs_t& _l, rhs_t& _r) : lhs_(_l), rhs_(_r){}; -template +template PORTBLAS_INLINE - typename WGAtomicReduction::index_t - WGAtomicReduction::get_size() const { + typename WGAtomicReduction::index_t + WGAtomicReduction::get_size() + const { return rhs_.get_size(); } -template +template PORTBLAS_INLINE bool -WGAtomicReduction::valid_thread( +WGAtomicReduction::valid_thread( cl::sycl::nd_item<1> ndItem) const { return true; } -template +template PORTBLAS_INLINE - typename WGAtomicReduction::value_t - WGAtomicReduction::eval( + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( cl::sycl::nd_item<1> ndItem) { auto atomic_res = cl::sycl::atomic_ref +template template PORTBLAS_INLINE - typename WGAtomicReduction::value_t - WGAtomicReduction::eval( + typename WGAtomicReduction::value_t + WGAtomicReduction::eval( sharedT scratch, cl::sycl::nd_item<1> ndItem) { const auto size = get_size(); const int lid = static_cast(ndItem.get_global_linear_id()); @@ -120,7 +126,7 @@ PORTBLAS_INLINE cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - if constexpr (!managed_mem) { + if constexpr (!usmManagedMem) { auto atomic_res = cl::sycl::atomic_ref -PORTBLAS_INLINE void WGAtomicReduction +PORTBLAS_INLINE void WGAtomicReduction::bind(cl::sycl::handler& h) { lhs_.bind(h); rhs_.bind(h); } -template -PORTBLAS_INLINE void WGAtomicReduction +PORTBLAS_INLINE void WGAtomicReduction::adjust_access_displacement() { lhs_.adjust_access_displacement(); rhs_.adjust_access_displacement(); From eeec76116bf41a18d459b48ed96751ff8fa72856 Mon Sep 17 00:00:00 2001 From: Ouadie EL FAROUKI Date: Mon, 25 Mar 2024 14:52:05 +0000 Subject: [PATCH 08/12] Added clarifications regarding half support --- README.md | 4 ++-- src/interface/blas3/backend/default_cpu.hpp | 3 ++- src/interface/blas3/backend/intel_gpu.hpp | 3 ++- src/interface/blas3/backend/nvidia_gpu.hpp | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7b1ee2b70..3752e5bf7 100644 --- a/README.md +++ b/README.md @@ -491,10 +491,10 @@ Some of the supported options are: | `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default | | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) | | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) | -| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` | -| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` | +| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float`. Enabling other types such as complex or half requires setting their respective options *(next)*. | | `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) | | `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) | +| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. 
Default is `int` |
 
 ## ComputeCpp Compilation *(Deprecated)*
 
diff --git a/src/interface/blas3/backend/default_cpu.hpp b/src/interface/blas3/backend/default_cpu.hpp
index bf5dec3b3..dbe475d1b 100644
--- a/src/interface/blas3/backend/default_cpu.hpp
+++ b/src/interface/blas3/backend/default_cpu.hpp
@@ -132,7 +132,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp
index bf56e684e..8cc8180ef 100644
--- a/src/interface/blas3/backend/intel_gpu.hpp
+++ b/src/interface/blas3/backend/intel_gpu.hpp
@@ -222,7 +222,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
     return _dependencies;
   } else {
diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp
index a60933694..a6f8dc419 100644
--- a/src/interface/blas3/backend/nvidia_gpu.hpp
+++ b/src/interface/blas3/backend/nvidia_gpu.hpp
@@ -188,7 +188,8 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
-  // Unused configuration cases
+  // The symmetric matrix cases are not enabled with half (fp16) as the Symm
+  // operator's specification doesn't include half floating point data.
   if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
     return _dependencies;
   } else {

From 0201a502ba92b79a6d061a0c3b3a545c84396e8b Mon Sep 17 00:00:00 2001
From: nscipione
Date: Mon, 25 Mar 2024 16:16:01 +0000
Subject: [PATCH 09/12] Add documentation for new template parameter

Add documentation for `usmManagedMem` template parameter.

Signed-off-by: nscipione
---
 src/interface/blas1_interface.hpp | 60 ++++++++++++++++++++++++++-----
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index bc458d42e..7c8180aa5 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -219,12 +219,26 @@ typename sb_handle_t::event_t _asum(
 * the platform being compiled for and other parameters, provides different
 * template parameters to ensure the most optimal kernel is constructed.
 *
- * @tparam localSize specifies the number of threads per work group used by
- * the kernel
- * @tparam localMemSize specifies the size of local shared memory to use, which
+ * @tparam localSize Specifies the number of threads per work group used by
+ * the kernel
+ * @tparam localMemSize Specifies the size of local shared memory to use, which
 * is device and implementation dependent. 
If 0 the * implementation use a kernel implementation which doesn't * require local memory. + * @tparam usmManagedMem Specifies if usm memory allocation is automatically + * managed or not. The memory automatically managed + * requires that atomic address space is set to generic. + * This is a strict requirement only for AMD gpus, since + * otherwise it will rely on pcie atomics + * which we cannot enforce, guarantee or check due to its + * hardware nature. Other targets do not have the same + * strong dependency and managed memory is handled + * correctly in any case by default. It is automatically + * initialized to false to reduce verbosity of + * initialization for many targets since only one of them, + * with specific allocation type, requires a different + * value. Having a default value allows the compiler to + * handle automatically other templates. */ template Date: Tue, 26 Mar 2024 17:42:55 +0100 Subject: [PATCH 10/12] Apply suggestions from code review Co-authored-by: HJA Bird Co-authored-by: pgorlani <92453485+pgorlani@users.noreply.github.com> --- src/interface/blas1/backend/amd_gpu.hpp | 14 ++++++------- src/operations/blas1/WGAtomicReduction.hpp | 23 ++++++++-------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index e6d8e44ed..3a1409b8d 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -35,13 +35,13 @@ typename sb_handle_t::event_t _asum( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) { /** - * This compile time check is absolutely necessary for AMD gpu. - * AMD atomic operations required a specific combination of hardware that we - *cannot check neither enforce to users. Since reduction operators kernel - *implementation useses atomic operation without that particular combination - *the operator may fail silently. This check enforce a different atomic - *address space causing a big performance degradation, but making the kernel - *behaves correctly also with managed memory (aka malloc_shared allocation). + * This compile time check is absolutely necessary for AMD GPUs. + * AMD's atomic operations require a specific combination of hardware that + * cannot be checked nor enforced. Since the reduction operator kernel + * implementation uses atomic operations, without that particular hardware combination + * the reduction may silently fail. This check enforces a different atomic + * address space causing a big performance degradation, but also making the kernel + * behave correctly with managed memory (aka malloc_shared allocation). **/ #ifdef SB_ENABLE_USM bool usm_managed_mem{false}; diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp index 21756c7c0..ca46b8269 100644 --- a/src/operations/blas1/WGAtomicReduction.hpp +++ b/src/operations/blas1/WGAtomicReduction.hpp @@ -126,21 +126,14 @@ PORTBLAS_INLINE cl::sycl::plus()); } if (ndItem.get_local_id()[0] == 0) { - if constexpr (!usmManagedMem) { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); - atomic_res += val; - } else { - auto atomic_res = - cl::sycl::atomic_ref( - lhs_.get_data()[0]); - atomic_res += val; - } + constexpr cl::sycl::access::address_space addr_sp = + usmManagedMem ? 
cl::sycl::access::address_space::generic_space + : cl::sycl::access::address_space::global_space; + auto atomic_res = + cl::sycl::atomic_ref( + lhs_.get_data()[0]); + atomic_res += val; } return {}; From 3568b4858dfb87d2cabc82eed9555991358f350b Mon Sep 17 00:00:00 2001 From: nscipione Date: Thu, 28 Mar 2024 14:05:11 +0000 Subject: [PATCH 11/12] Moving memory alloc type into helper function and addressing other PR comments Addressing PR comments on memory allocation type function checker and updating comment. Fixing documentation. Signed-off-by: nscipione --- include/portblas_helper.h | 10 +++++ src/interface/blas1/backend/amd_gpu.hpp | 40 ++++++++----------- src/interface/blas1_interface.hpp | 52 +++++++++---------------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/include/portblas_helper.h b/include/portblas_helper.h index 4ef250829..a07ba745a 100644 --- a/include/portblas_helper.h +++ b/include/portblas_helper.h @@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value, } #endif +template +inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) { + if constexpr (std::is_pointer_v) { + return sycl::usm::alloc::shared == + sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); + } else { + return false; + } +} + } // end namespace helper } // end namespace blas #endif // PORTBLAS_HELPER_H diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp index 3a1409b8d..f61bc6e8b 100644 --- a/src/interface/blas1/backend/amd_gpu.hpp +++ b/src/interface/blas1/backend/amd_gpu.hpp @@ -25,6 +25,7 @@ #ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP #define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP #include "interface/blas1_interface.h" +#include "portblas_helper.h" namespace blas { namespace asum { @@ -38,18 +39,11 @@ typename sb_handle_t::event_t _asum( * This compile time check is absolutely necessary for AMD GPUs. * AMD's atomic operations require a specific combination of hardware that * cannot be checked nor enforced. Since the reduction operator kernel - * implementation uses atomic operations, without that particular hardware combination - * the reduction may silently fail. This check enforces a different atomic - * address space causing a big performance degradation, but also making the kernel - * behave correctly with managed memory (aka malloc_shared allocation). + * implementation uses atomic operations, without that particular hardware + * combination the reduction may silently fail. **/ #ifdef SB_ENABLE_USM - bool usm_managed_mem{false}; - if constexpr (std::is_pointer_v) { - usm_managed_mem = - sycl::usm::alloc::shared == - sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context()); - } + const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs); #else constexpr bool usm_managed_mem{false}; #endif @@ -135,15 +129,14 @@ typename sb_handle_t::event_t _nrm2( sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx, container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) { /** - * Read comment in _asum above. + * This compile time check is absolutely necessary for AMD GPUs. + * AMD's atomic operations require a specific combination of hardware that + * cannot be checked nor enforced. Since the reduction operator kernel + * implementation uses atomic operations, without that particular hardware + * combination the reduction may silently fail. 
**/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
@@ -185,15 +178,14 @@ typename sb_handle_t::event_t _dot(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _vy, increment_t _incy, container_2_t _rs,
     const typename sb_handle_t::event_t& _dependencies) {
   /**
-   * Read comment in _asum above.
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
   **/
 #ifdef SB_ENABLE_USM
-  bool usm_managed_mem{false};
-  if constexpr (std::is_pointer_v) {
-    usm_managed_mem =
-        sycl::usm::alloc::shared ==
-        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
-  }
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
 #else
   constexpr bool usm_managed_mem{false};
 #endif
diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index 7c8180aa5..e92027823 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -226,19 +226,14 @@ typename sb_handle_t::event_t _asum(
 * implementation use a kernel implementation which doesn't
 * require local memory.
 * @tparam usmManagedMem Specifies if usm memory allocation is automatically
- *                      managed or not. The memory automatically managed
+ *                      managed or not. Automatically managed memory
 *                       requires that atomic address space is set to generic.
- *                       This is a strict requirement only for AMD gpus, since
- *                       otherwise it will rely on pcie atomics
- *                       which we cannot enforce, guarantee or check due to its
- *                       hardware nature. Other targets do not have the same
+ *                       This is a strict requirement only for AMD GPUs, since
+ *                       AMD's implementation of atomics may depend on specific
+ *                       hardware configurations (PCIe atomics) that cannot be
+ *                       checked at runtime. Other targets do not have the same
 *                       strong dependency and managed memory is handled
- *                       correctly in any case by default. It is automatically
- *                       initialized to false to reduce verbosity of
- *                       initialization for many targets since only one of them,
- *                       with specific allocation type, requires a different
- *                       value. Having a default value allows the compiler to
- *                       handle automatically other templates.
+ *                       correctly by default.

From 3568b4858dfb87d2cabc82eed9555991358f350b Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI
Date: Wed, 3 Apr 2024 16:40:29 +0100
Subject: [PATCH 12/12] Addressed review comments

---
 src/interface/blas3/backend/intel_gpu.hpp  | 2 +-
 src/interface/blas3/backend/nvidia_gpu.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/interface/blas3/backend/intel_gpu.hpp b/src/interface/blas3/backend/intel_gpu.hpp
index 8cc8180ef..1fc837dd3 100644
--- a/src/interface/blas3/backend/intel_gpu.hpp
+++ b/src/interface/blas3/backend/intel_gpu.hpp
@@ -224,7 +224,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     container_2_t _c, index_t _ldc, index_t _stridec, index_t batch_size,
     gemm_batch_type_t batch_type,
     const typename sb_handle_t::event_t& _dependencies) {
   // The symmetric matrix cases are not enabled with half (fp16) as the Symm
   // operator's specification doesn't include half floating point data. 
-  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+  if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
     if (batch_type == gemm_batch_type_t::interleaved) {
diff --git a/src/interface/blas3/backend/nvidia_gpu.hpp b/src/interface/blas3/backend/nvidia_gpu.hpp
index a6f8dc419..00b036d4f 100644
--- a/src/interface/blas3/backend/nvidia_gpu.hpp
+++ b/src/interface/blas3/backend/nvidia_gpu.hpp
@@ -190,7 +190,7 @@ _gemm(sb_handle_t& sb_handle, index_t _M, index_t _N, index_t _K,
     const typename sb_handle_t::event_t& _dependencies) {
   // The symmetric matrix cases are not enabled with half (fp16) as the Symm
   // operator's specification doesn't include half floating point data.
-  if constexpr (s_a && s_b || ((s_a && _t_b) || (s_b && _t_a))) {
+  if constexpr (s_a || s_b) {
     return _dependencies;
   } else {
     if (batch_type == gemm_batch_type_t::interleaved) {
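Note on the reduction change in patches 05 through 11: the fix boils down to
two pieces, a compile-time choice of atomic address space and a runtime check
of the output allocation kind. The sketch below condenses both into a
standalone SYCL 2020 fragment. It is illustrative only: `accumulate` and
`launch_reduction_tail` are made-up names, not portBLAS symbols, and the code
assumes a SYCL 2020 compiler with USM support.

    #include <sycl/sycl.hpp>

    // Device-side tail of a work-group reduction: the group leader adds its
    // partial value into *out. The address space is a template parameter
    // because, on AMD GPUs lacking the required hardware support,
    // global_space atomics on malloc_shared memory may silently produce
    // wrong results, while generic_space atomics stay correct (at a
    // performance cost).
    template <bool UsmManagedMem, typename T>
    void accumulate(T* out, T partial) {
      constexpr auto space = UsmManagedMem
                                 ? sycl::access::address_space::generic_space
                                 : sycl::access::address_space::global_space;
      sycl::atomic_ref<T, sycl::memory_order::relaxed,
                       sycl::memory_scope::device, space>
          result(*out);
      result += partial;
    }

    // Host-side dispatch, mirroring the role of blas::helper::is_malloc_shared
    // from patch 11: query the allocation kind of the result pointer and pick
    // the matching kernel instantiation.
    template <typename T>
    void launch_reduction_tail(sycl::queue& q, T* out, T partial) {
      const bool managed = sycl::usm::alloc::shared ==
                           sycl::get_pointer_type(out, q.get_context());
      if (managed) {
        q.single_task([=] { accumulate<true>(out, partial); }).wait();
      } else {
        q.single_task([=] { accumulate<false>(out, partial); }).wait();
      }
    }

On non-AMD backends both instantiations behave identically, which is why the
series defaults the new template parameter to false and only the AMD backend
queries the allocation kind of the result memory.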