codeplaysoftware · s-Nick · Apr 11, 2024 · Mar 6, 2024 · Mar 6, 2024 · Mar 7, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -113,12 +113,6 @@ option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON)
 option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF)
 option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF)
 
-if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
-    OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET)))
-  set(BLAS_ENABLE_HALF OFF)
-  message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
-endif()
-
 if (SYCL_COMPILER MATCHES "adaptivecpp") 
   if(BLAS_ENABLE_COMPLEX)
     message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex

diff --git a/README.md b/README.md
@@ -491,10 +491,10 @@ Some of the supported options are:
 | `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` |  Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
 | `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
 | `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
-| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
-| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |
+| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float`. Enabling other types such as complex or half requires setting their respective options *(next)*. |
 | `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) |
 | `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) |
+| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |
 
 ## ComputeCpp Compilation *(Deprecated)*
 

diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake
@@ -702,6 +702,15 @@ else() # default cpu backend
     add_gemm_configuration(
       "${data}" 64 "false" "false" "false"
       64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")
+
+    if(BLAS_ENABLE_HALF)
+      add_gemm_configuration(
+        "half"  128 "false" "false" "false"
+        64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "full" 1 "strided" "false" "false")
+      add_gemm_configuration(
+        "half" 64 "false" "false" "false"
+        64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")
+    endif()
   endforeach()
 
   if(BLAS_ENABLE_COMPLEX)

diff --git a/cmake/Modules/SYCL.cmake b/cmake/Modules/SYCL.cmake
@@ -26,11 +26,13 @@ include(CheckCXXCompilerFlag)
 include(ConfigurePORTBLAS)
 
 # find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp)
-if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
-  message(STATUS "Using `omp` as ACPP_TARGETS")
-  set(ACPP_TARGETS "omp")
-else()
-  message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
+if(SYCL_COMPILER MATCHES "adaptivecpp")
+  if(NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
+    message(STATUS "Using `omp` as ACPP_TARGETS")
+    set(ACPP_TARGETS "omp")
+  else()
+    message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
+  endif()
 endif()
 
 check_cxx_compiler_flag("--acpp-targets" has_acpp)

diff --git a/include/container/sycl_iterator.h b/include/container/sycl_iterator.h
@@ -194,51 +194,27 @@ template <cl::sycl::access::mode acc_md_t>
 inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
 BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh,
                                               size_t size) {
-  if constexpr (acc_md_t == cl::sycl::access::mode::read) {
-    return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
-        buffer_, cgh, cl::sycl::range<1>(size),
-        cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
-  } else {
-    // Skip data initialization if not accessing in read mode only
-    return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
-        buffer_, cgh, cl::sycl::range<1>(size),
-        cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
-        cl::sycl::property::no_init{});
-  }
+  return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
+      buffer_, cgh, cl::sycl::range<1>(size),
+      cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
 }
 
 template <typename element_t>
 template <cl::sycl::access::mode acc_md_t>
 inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
 BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh) {
-  if constexpr (acc_md_t == cl::sycl::access::mode::read) {
-    return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
-        cgh, BufferIterator<element_t>::get_size());
-  } else {
-    // Skip data initialization if not accessing in read mode only
-    return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
-        cgh, BufferIterator<element_t>::get_size(),
-        cl::sycl::property::no_init{});
-  }
+  return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
+      cgh, BufferIterator<element_t>::get_size());
 }
 
 template <typename element_t>
 template <cl::sycl::access::mode acc_md_t>
 inline typename BufferIterator<element_t>::template placeholder_accessor_t<
     acc_md_t>
 BufferIterator<element_t>::get_range_accessor(size_t size) {
-  if constexpr (acc_md_t == cl::sycl::access::mode::read) {
-    return typename BufferIterator<element_t>::template placeholder_accessor_t<
-        acc_md_t>(buffer_, cl::sycl::range<1>(size),
-                  cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
-
-  } else {
-    // Skip data initialization if not accessing in read mode only
-    return typename BufferIterator<element_t>::template placeholder_accessor_t<
-        acc_md_t>(buffer_, cl::sycl::range<1>(size),
-                  cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
-                  cl::sycl::property::no_init{});
-  }
+  return typename BufferIterator<element_t>::template placeholder_accessor_t<
+      acc_md_t>(buffer_, cl::sycl::range<1>(size),
+                cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
 }
 
 template <typename element_t>

diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
  * \brief Prototype for the internal implementation of the ASUM operation. See
  * documentation in the blas1_interface.hpp file for details.
  */
-template <int localSize, int localMemSize, typename sb_handle_t,
-          typename container_0_t, typename container_1_t, typename index_t,
-          typename increment_t>
+template <int localSize, int localMemSize, bool usmManagedMem = false,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
 typename sb_handle_t::event_t _asum_impl(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const index_t number_WG,
@@ -257,9 +257,9 @@ typename sb_handle_t::event_t _nrm2(
  * \brief Prototype for the internal implementation of the NRM2 operator. See
  * documentation in the blas1_interface.hpp file for details.
  */
-template <int localSize, int localMemSize, typename sb_handle_t,
-          typename container_0_t, typename container_1_t, typename index_t,
-          typename increment_t>
+template <int localSize, int localMemSize, bool usmManagedMem = false,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
+          typename index_t, typename increment_t>
 typename sb_handle_t::event_t _nrm2_impl(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const index_t number_WG,
@@ -269,8 +269,8 @@ typename sb_handle_t::event_t _nrm2_impl(
  * \brief Prototype for the internal implementation of the Dot operator. See
  * documentation in the blas1_interface.hpp file for details.
  */
-template <int localSize, int localMemSize, typename sb_handle_t,
-          typename container_0_t, typename container_1_t,
+template <int localSize, int localMemSize, bool usmManagedMem = false,
+          typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename container_2_t, typename index_t, typename increment_t>
 typename sb_handle_t::event_t _dot_impl(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,

diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
@@ -208,7 +208,8 @@ struct AssignReduction {
  * function below.
  *
  */
-template <typename operator_t, typename lhs_t, typename rhs_t>
+template <typename operator_t, bool usmManagedMem, typename lhs_t,
+          typename rhs_t>
 struct WGAtomicReduction {
   using value_t = typename lhs_t::value_t;
   using index_t = typename rhs_t::index_t;
@@ -304,10 +305,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
       lhs_, rhs_, local_num_thread_, global_num_thread_);
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t>
-inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
-    lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
+template <typename operator_t, bool usmManagedMem = false, typename lhs_t,
+          typename rhs_t>
+inline WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>
+make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
+  return WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>(lhs_, rhs_);
 }
 
 template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>

diff --git a/include/portblas_helper.h b/include/portblas_helper.h
@@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
 }
 #endif
 
+template <typename sb_handle_t, typename containerT>
+inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
+  if constexpr (std::is_pointer_v<containerT>) {
+    return sycl::usm::alloc::shared ==
+           sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
+  } else {
+    return false;
+  }
+}
+
 }  // end namespace helper
 }  // end namespace blas
 #endif  // PORTBLAS_HELPER_H
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
@@ -1,3 +1,31 @@
+#/***************************************************************************
+# *
+# *  @license
+# *  Copyright (C) Codeplay Software Limited
+# *  Licensed under the Apache License, Version 2.0 (the "License");
+# *  you may not use this file except in compliance with the License.
+# *  You may obtain a copy of the License at
+# *
+# *      http://www.apache.org/licenses/LICENSE-2.0
+# *
+# *  For your convenience, a copy of the License has been included in this
+# *  repository.
+# *
+# *  Unless required by applicable law or agreed to in writing, software
+# *  distributed under the License is distributed on an "AS IS" BASIS,
+# *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# *  See the License for the specific language governing permissions and
+# *  limitations under the License.
+# *
+# *  portBLAS: BLAS implementation using SYCL
+# *
+# *  @filename CMakeLists.txt
+# *
+# **************************************************************************/
+cmake_minimum_required(VERSION 3.4.3)
+
+project(portBLASSample LANGUAGES CXX)
+
 set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../include)
 set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src)
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})

diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
@@ -25,6 +25,7 @@
 #ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
 #define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
 #include "interface/blas1_interface.h"
+#include "portblas_helper.h"
 
 namespace blas {
 namespace asum {
@@ -34,16 +35,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
 typename sb_handle_t::event_t _asum(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
-  if (_N < (1 << 18)) {
-    constexpr index_t localSize = 1024;
-    const index_t number_WG = (_N + localSize - 1) / localSize;
-    return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+  /**
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
+   **/
+#ifdef SB_ENABLE_USM
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 256;
+      return blas::internal::_asum_impl<localSize, 32, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   } else {
-    constexpr int localSize = 512;
-    constexpr index_t number_WG = 256;
-    return blas::internal::_asum_impl<localSize, 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_asum_impl<static_cast<int>(localSize), 32, false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 256;
+      return blas::internal::_asum_impl<localSize, 32, false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   }
 }
 }  // namespace backend
@@ -101,16 +128,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
 typename sb_handle_t::event_t _nrm2(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
-  if (_N < (1 << 18)) {
-    constexpr index_t localSize = 1024;
-    const index_t number_WG = (_N + localSize - 1) / localSize;
-    return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+  /**
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
+   **/
+#ifdef SB_ENABLE_USM
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 512;
+      return blas::internal::_nrm2_impl<localSize, 32, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   } else {
-    constexpr int localSize = 512;
-    constexpr index_t number_WG = 512;
-    return blas::internal::_nrm2_impl<localSize, 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 512;
+      return blas::internal::_nrm2_impl<localSize, 32, false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   }
 }
 }  // namespace backend
@@ -124,16 +177,42 @@ typename sb_handle_t::event_t _dot(
     sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _vy, increment_t _incy, container_2_t _rs,
     const typename sb_handle_t::event_t& _dependencies) {
-  if (_N < (1 << 18)) {
-    constexpr index_t localSize = 1024;
-    const index_t number_WG = (_N + localSize - 1) / localSize;
-    return blas::internal::_dot_impl<static_cast<int>(localSize), 32>(
-        sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+  /**
+   * This compile time check is absolutely necessary for AMD GPUs.
+   * AMD's atomic operations require a specific combination of hardware that
+   * cannot be checked nor enforced. Since the reduction operator kernel
+   * implementation uses atomic operations, without that particular hardware
+   * combination the reduction may silently fail.
+   **/
+#ifdef SB_ENABLE_USM
+  const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_dot_impl<static_cast<int>(localSize), 32, true>(
+          sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 512;
+      return blas::internal::_dot_impl<localSize, 32, true>(
+          sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+    }
   } else {
-    constexpr int localSize = 512;
-    constexpr index_t number_WG = 512;
-    return blas::internal::_dot_impl<localSize, 32>(
-        sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_dot_impl<static_cast<int>(localSize), 32, false>(
+          sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 512;
+      return blas::internal::_dot_impl<localSize, 32, false>(
+          sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
+    }
   }
 }
 }  // namespace backend