Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Fixed issues encountered through oneMKL portBLAS backend #504

Merged
merged 12 commits into from
Apr 11, 2024
Merged
6 changes: 0 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,6 @@ option(BLAS_ENABLE_EXTENSIONS "Whether to enable portBLAS extensions" ON)
option(BLAS_ENABLE_COMPLEX "Whether to enable complex data type for GEMM" OFF)
option(BLAS_ENABLE_HALF "Whether to enable sycl::half data type for supported operators" OFF)

if(((NOT INSTALL_HEADER_ONLY) AND (TUNING_TARGET STREQUAL "DEFAULT_CPU"))
OR (INSTALL_HEADER_ONLY AND (NOT TUNING_TARGET)))
set(BLAS_ENABLE_HALF OFF)
message(STATUS "FP16 operations are not supported for CPU targets. BLAS_ENABLE_HALF is disabled")
endif()

if (SYCL_COMPILER MATCHES "adaptivecpp")
if(BLAS_ENABLE_COMPLEX)
message(STATUS "SYCL Complex data is not supported on AdaptiveCpp/hipSYCL. Complex
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -491,10 +491,10 @@ Some of the supported options are:
| `BLAS_MEMPOOL_BENCHMARK` | `ON`/`OFF` | Determines whether to enable the scratchpad memory pool for benchmark execution. `OFF` by default |
| `BLAS_ENABLE_CONST_INPUT` | `ON`/`OFF` | Determines whether to enable kernel instantiation with const input buffer (`ON` by default) |
| `BLAS_ENABLE_EXTENSIONS` | `ON`/`OFF` | Determines whether to enable portBLAS extensions (`ON` by default) |
| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float` |
| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |
| `BLAS_DATA_TYPES` | `float;double` | Determines the floating-point types to instantiate BLAS operations for. Default is `float`. Enabling other types such as complex or half requires setting their respective options *(next)*. |
| `BLAS_ENABLE_COMPLEX` | `ON`/`OFF` | Determines whether to enable Complex data type support *(GEMM Operators only)* (`OFF` by default) |
| `BLAS_ENABLE_HALF` | `ON`/`OFF` | Determines whether to enable Half data type support *(Support is limited to some Level 1 operators and Gemm)* (`OFF` by default) |
| `BLAS_INDEX_TYPES` | `int32_t;int64_t` | Determines the type(s) to use for `index_t` and `increment_t`. Default is `int` |

## ComputeCpp Compilation *(Deprecated)*

Expand Down
9 changes: 9 additions & 0 deletions cmake/CmakeFunctionHelper.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,15 @@ else() # default cpu backend
add_gemm_configuration(
"${data}" 64 "false" "false" "false"
64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")

if(BLAS_ENABLE_HALF)
add_gemm_configuration(
"half" 128 "false" "false" "false"
64 4 4 8 8 1 1 1 1 1 1 1 1 1 float float "no_local" "standard" "full" 1 "strided" "false" "false")
add_gemm_configuration(
"half" 64 "false" "false" "false"
64 2 2 4 4 1 1 1 1 4 4 1 1 1 float float "no_local" "standard" "full" 4 "interleaved" "false" "false")
endif()
endforeach()

if(BLAS_ENABLE_COMPLEX)
Expand Down
12 changes: 7 additions & 5 deletions cmake/Modules/SYCL.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ include(CheckCXXCompilerFlag)
include(ConfigurePORTBLAS)

# find_package(AdaptiveCpp) requires ACPP_TARGETS to be set, so set it to a default value before find_package(AdaptiveCpp)
if(SYCL_COMPILER MATCHES "adaptivecpp" AND NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
message(STATUS "Using `omp` as ACPP_TARGETS")
set(ACPP_TARGETS "omp")
else()
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
if(SYCL_COMPILER MATCHES "adaptivecpp")
if(NOT ACPP_TARGETS AND NOT ENV{ACPP_TARGETS})
message(STATUS "Using `omp` as ACPP_TARGETS")
set(ACPP_TARGETS "omp")
else()
message(STATUS "Using ${ACPP_TARGETS} as ACPP_TARGETS")
endif()
endif()

check_cxx_compiler_flag("--acpp-targets" has_acpp)
Expand Down
40 changes: 8 additions & 32 deletions include/container/sycl_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,51 +194,27 @@ template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh,
size_t size) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
Comment on lines -202 to -206
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this have a performance impact for devices where this code was originally working?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hjabird This was actually added to get rid of some warnings regarding buffers initialization when using AdaptiveCpp, but introducing it caused some tests to fail through the portBLAS backend API in oneMKL, that's why we're reverting. No performance aspects have been tested/verified but this change was tested with AdaptiveCpp (warnings are back but no errors/tests failures).

}
return typename BufferIterator<element_t>::template accessor_t<acc_md_t>(
buffer_, cgh, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template accessor_t<acc_md_t>
BufferIterator<element_t>::get_range_accessor(cl::sycl::handler& cgh) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
} else {
// Skip data initialization if not accessing in read mode only
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size(),
cl::sycl::property::no_init{});
}
return BufferIterator<element_t>::get_range_accessor<acc_md_t>(
cgh, BufferIterator<element_t>::get_size());
}

template <typename element_t>
template <cl::sycl::access::mode acc_md_t>
inline typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>
BufferIterator<element_t>::get_range_accessor(size_t size) {
if constexpr (acc_md_t == cl::sycl::access::mode::read) {
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));

} else {
// Skip data initialization if not accessing in read mode only
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()),
cl::sycl::property::no_init{});
}
return typename BufferIterator<element_t>::template placeholder_accessor_t<
acc_md_t>(buffer_, cl::sycl::range<1>(size),
cl::sycl::id<1>(BufferIterator<element_t>::get_offset()));
}

template <typename element_t>
Expand Down
16 changes: 8 additions & 8 deletions include/interface/blas1_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
* \brief Prototype for the internal implementation of the ASUM operation. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
Rbiessy marked this conversation as resolved.
Show resolved Hide resolved
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
Expand Down Expand Up @@ -257,9 +257,9 @@ typename sb_handle_t::event_t _nrm2(
* \brief Prototype for the internal implementation of the NRM2 operator. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
Expand All @@ -269,8 +269,8 @@ typename sb_handle_t::event_t _nrm2_impl(
* \brief Prototype for the internal implementation of the Dot operator. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t,
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename container_2_t, typename index_t, typename increment_t>
typename sb_handle_t::event_t _dot_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
Expand Down
12 changes: 7 additions & 5 deletions include/operations/blas1_trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@ struct AssignReduction {
* function below.
*
*/
template <typename operator_t, typename lhs_t, typename rhs_t>
template <typename operator_t, bool usmManagedMem, typename lhs_t,
typename rhs_t>
struct WGAtomicReduction {
using value_t = typename lhs_t::value_t;
using index_t = typename rhs_t::index_t;
Expand Down Expand Up @@ -304,10 +305,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
lhs_, rhs_, local_num_thread_, global_num_thread_);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
template <typename operator_t, bool usmManagedMem = false, typename lhs_t,
typename rhs_t>
inline WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>
make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, usmManagedMem, lhs_t, rhs_t>(lhs_, rhs_);
}

template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
Expand Down
10 changes: 10 additions & 0 deletions include/portblas_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,16 @@ inline cl::sycl::event fill(cl::sycl::queue q, element_t *buff, element_t value,
}
#endif

template <typename sb_handle_t, typename containerT>
inline bool is_malloc_shared(sb_handle_t &sb_handle, const containerT _rs) {
if constexpr (std::is_pointer_v<containerT>) {
return sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
} else {
return false;
}
}

} // end namespace helper
} // end namespace blas
#endif // PORTBLAS_HELPER_H
28 changes: 28 additions & 0 deletions samples/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,31 @@
#/***************************************************************************
# *
# * @license
# * Copyright (C) Codeplay Software Limited
# * Licensed under the Apache License, Version 2.0 (the "License");
# * you may not use this file except in compliance with the License.
# * You may obtain a copy of the License at
# *
# * http://www.apache.org/licenses/LICENSE-2.0
# *
# * For your convenience, a copy of the License has been included in this
# * repository.
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# *
# * portBLAS: BLAS implementation using SYCL
# *
# * @filename CMakeLists.txt
# *
# **************************************************************************/
cmake_minimum_required(VERSION 3.4.3)
OuadiElfarouki marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious about the choice of 3.4.3 - has this been tested?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I just used the same version required by the main CMakeLists.txt, as I got some warnings/errors if I still remember when building the samples alone (header only with portBLAS)


project(portBLASSample LANGUAGES CXX)

set(PORTBLAS_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../include)
set(PORTBLAS_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
Expand Down
133 changes: 106 additions & 27 deletions src/interface/blas1/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#ifndef PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#define PORTBLAS_ASUM_AMD_GPU_BACKEND_HPP
#include "interface/blas1_interface.h"
#include "portblas_helper.h"

namespace blas {
namespace asum {
Expand All @@ -34,16 +35,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _asum(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
hjabird marked this conversation as resolved.
Show resolved Hide resolved
}
}
} // namespace backend
Expand Down Expand Up @@ -101,16 +128,42 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _nrm2(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_nrm2_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_nrm2_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
Expand All @@ -124,16 +177,42 @@ typename sb_handle_t::event_t _dot(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _vy, increment_t _incy, container_2_t _rs,
const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
/**
* This compile time check is absolutely necessary for AMD GPUs.
* AMD's atomic operations require a specific combination of hardware that
* cannot be checked nor enforced. Since the reduction operator kernel
* implementation uses atomic operations, without that particular hardware
* combination the reduction may silently fail.
**/
#ifdef SB_ENABLE_USM
const bool usm_managed_mem = blas::helper::is_malloc_shared(sb_handle, _rs);
#else
constexpr bool usm_managed_mem{false};
#endif
if (usm_managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_dot_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 512;
return blas::internal::_dot_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _vy, _incy, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
Expand Down
Loading
Loading