From 4da32a5ef07af931695a373e45be5a041f892015 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Fri, 1 Dec 2023 13:39:06 +0100 Subject: [PATCH] remove `ALPAKA_ASSERT_OFFLOAD`, introduce `ALPAKA_ASSERT_ACC` fix #2186 fix #2001 Provide a device side assert `ALPAKA_ASSERT_ACC` which can be disabled by defining `ALPAKA_DISABLE_ASSERT_ACC` in the C++ code or by the CMake option `alpaka_ASSERT_ACC_ENABLE`. For CPU devices or host side code the assert behaves like `ALPAKA_ASSERT`. Co-authored-by: Andrea Bocci --- cmake/alpakaCommon.cmake | 9 ++- example/bufferCopy/src/bufferCopy.cpp | 2 +- .../shared/dyn/BlockSharedMemDynMember.hpp | 2 +- .../st/detail/BlockSharedMemStMemberImpl.hpp | 8 +-- include/alpaka/core/Assert.hpp | 55 ++++++++++++++----- include/alpaka/idx/bt/IdxBtOmp.hpp | 2 +- include/alpaka/warp/WarpGenericSycl.hpp | 4 +- 7 files changed, 55 insertions(+), 27 deletions(-) diff --git a/cmake/alpakaCommon.cmake b/cmake/alpakaCommon.cmake index 65dcdb86428d..70598559bee6 100644 --- a/cmake/alpakaCommon.cmake +++ b/cmake/alpakaCommon.cmake @@ -139,6 +139,12 @@ if(alpaka_DISABLE_VENDOR_RNG) target_compile_definitions(alpaka INTERFACE "ALPAKA_DISABLE_VENDOR_RNG") endif() +# Device side assert +option(alpaka_ASSERT_ACC_ENABLE "Enable device side asserts. In case value is OFF device side asserts will be disabled even if NDEBUG is not defined." ON) +if(NOT alpaka_ASSERT_ACC_ENABLE) + target_compile_definitions(alpaka INTERFACE "ALPAKA_DISABLE_ASSERT_ACC") +endif() + #------------------------------------------------------------------------------- # Debug output of common variables. 
if(${alpaka_DEBUG} GREATER 1) @@ -731,9 +737,6 @@ if(alpaka_ACC_SYCL_ENABLE) endif() target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG=${alpaka_DEBUG}") -if(alpaka_DEBUG_OFFLOAD_ASSUME_HOST) - target_compile_definitions(alpaka INTERFACE "ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST") -endif() target_compile_definitions(alpaka INTERFACE "ALPAKA_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB=${alpaka_BLOCK_SHARED_DYN_MEMBER_ALLOC_KIB}") diff --git a/example/bufferCopy/src/bufferCopy.cpp b/example/bufferCopy/src/bufferCopy.cpp index b1e53ff20f7e..998df9539175 100644 --- a/example/bufferCopy/src/bufferCopy.cpp +++ b/example/bufferCopy/src/bufferCopy.cpp @@ -39,7 +39,7 @@ struct TestBufferKernel for(size_t z = idx[0]; z < data.extent(0); z += gridSize[0]) for(size_t y = idx[1]; y < data.extent(1); y += gridSize[1]) for(size_t x = idx[2]; x < data.extent(2); x += gridSize[2]) - ALPAKA_ASSERT_OFFLOAD( + ALPAKA_ASSERT_ACC( data(z, y, x) == alpaka::mapIdx<1u>(Vec{z, y, x}, Vec{data.extent(0), data.extent(1), data.extent(2)})[0]); } diff --git a/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp b/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp index fc1cced8ad4b..c6a323989d21 100644 --- a/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp +++ b/include/alpaka/block/shared/dyn/BlockSharedMemDynMember.hpp @@ -42,7 +42,7 @@ namespace alpaka public: BlockSharedMemDynMember(std::size_t sizeBytes) : m_dynPitch(getPitch(sizeBytes)) { - ALPAKA_ASSERT_OFFLOAD(static_cast(sizeBytes) <= staticAllocBytes()); + ALPAKA_ASSERT_ACC(static_cast(sizeBytes) <= staticAllocBytes()); } auto dynMemBegin() const -> uint8_t* diff --git a/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp b/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp index 1cb4922556b6..eb09790ff859 100644 --- a/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp +++ b/include/alpaka/block/shared/st/detail/BlockSharedMemStMemberImpl.hpp @@ -39,7 +39,7 @@ 
namespace alpaka::detail : m_mem(mem) , m_capacity(static_cast(capacity)) { - ALPAKA_ASSERT_OFFLOAD((m_mem == nullptr) == (m_capacity == 0u)); + ALPAKA_ASSERT_ACC((m_mem == nullptr) == (m_capacity == 0u)); } #else BlockSharedMemStMemberImpl(std::uint8_t* mem, std::size_t) : m_mem(mem) @@ -52,12 +52,12 @@ namespace alpaka::detail { // Add meta data chunk in front of the user data m_allocdBytes = varChunkEnd(m_allocdBytes); - ALPAKA_ASSERT_OFFLOAD(m_allocdBytes <= m_capacity); + ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity); auto* meta = getLatestVarPtr(); // Allocate variable m_allocdBytes = varChunkEnd(m_allocdBytes); - ALPAKA_ASSERT_OFFLOAD(m_allocdBytes <= m_capacity); + ALPAKA_ASSERT_ACC(m_allocdBytes <= m_capacity); // Update meta data with id and offset for the allocated variable. meta->id = id; @@ -87,7 +87,7 @@ namespace alpaka::detail // Adjust offset to be aligned std::uint32_t const alignedMetaDataOffset = varChunkEnd(off) - static_cast(sizeof(MetaData)); - ALPAKA_ASSERT_OFFLOAD( + ALPAKA_ASSERT_ACC( (alignedMetaDataOffset + static_cast(sizeof(MetaData))) <= m_allocdBytes); auto* metaDataPtr = reinterpret_cast(m_mem + alignedMetaDataOffset); off = metaDataPtr->offset; diff --git a/include/alpaka/core/Assert.hpp b/include/alpaka/core/Assert.hpp index 55e1560934ef..7ad2a2b0734e 100644 --- a/include/alpaka/core/Assert.hpp +++ b/include/alpaka/core/Assert.hpp @@ -9,22 +9,47 @@ #include #include +//! The assert can be explicitly disabled by defining NDEBUG #define ALPAKA_ASSERT(...) assert(__VA_ARGS__) -#if defined(ALPAKA_DEBUG_OFFLOAD_ASSUME_HOST) || defined(SYCL_EXT_ONEAPI_ASSERT) -# define ALPAKA_ASSERT_OFFLOAD(EXPRESSION) ALPAKA_ASSERT(EXPRESSION) -#elif defined __AMDGCN__ && (!defined NDEBUG) -# define ALPAKA_ASSERT_OFFLOAD(EXPRESSION) \ - do \ - { \ - if(!(EXPRESSION)) \ - __builtin_trap(); \ - } while(false) +//! Macro which expands to a noop. +//! Macro enforces a semicolon after the call. +#define ALPAKA_NOOP(...) 
\ + do \ + { \ + } while(false) + +//! ALPAKA_ASSERT_ACC_IMPL is an assert-like macro. +//! It can be disabled by setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor symbol or the NDEBUG preprocessor symbol. +#if !defined(ALPAKA_DISABLE_ASSERT_ACC) +# define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_ASSERT(__VA_ARGS__) +#else +# define ALPAKA_ASSERT_ACC_IMPL(...) ALPAKA_NOOP(__VA_ARGS__) +#endif + +//! ALPAKA_ASSERT_ACC is an assert-like macro. +//! +//! In device code for a GPU or SYCL backend it can be disabled by setting the ALPAKA_DISABLE_ASSERT_ACC preprocessor +//! symbol or the NDEBUG preprocessor symbol. In device code for a native C++ CPU backend and in host code, it is +//! equivalent to ALPAKA_ASSERT, and can be disabled by setting the NDEBUG preprocessor symbol. +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && defined(__CUDA_ARCH__) +// CUDA device code +# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__) +#elif defined(ALPAKA_ACC_GPU_HIP_ENABLED) && defined(__HIP_DEVICE_COMPILE__) +// HIP/ROCm device code +# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__) +#elif defined(ALPAKA_ACC_SYCL_ENABLED) && defined(__SYCL_DEVICE_ONLY__) +// SYCL/oneAPI device code +# if defined(SYCL_EXT_ONEAPI_ASSERT) +# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT_ACC_IMPL(__VA_ARGS__) +# else +# define ALPAKA_ASSERT_ACC(...) ALPAKA_NOOP(__VA_ARGS__) +# endif +// add here any other #elif conditions for non-CPU backends +// ... #else -# define ALPAKA_ASSERT_OFFLOAD(EXPRESSION) \ - do \ - { \ - } while(false) +// CPU backend, or host code +# define ALPAKA_ASSERT_ACC(...) ALPAKA_ASSERT(__VA_ARGS__) #endif namespace alpaka::core @@ -38,7 +63,7 @@ namespace alpaka::core [[maybe_unused]] TArg const& arg) { if constexpr(std::is_signed_v) - ALPAKA_ASSERT_OFFLOAD(arg >= 0); + ALPAKA_ASSERT_ACC(arg >= 0); // Nothing to do for unsigned types. 
} @@ -63,7 +88,7 @@ namespace alpaka::core [[maybe_unused]] TRhs const& rhs) { if constexpr(std::is_signed_v || (TLhs::value != 0u)) - ALPAKA_ASSERT_OFFLOAD(TLhs::value > rhs); + ALPAKA_ASSERT_ACC(TLhs::value > rhs); // Nothing to do for unsigned types comparing to zero. } diff --git a/include/alpaka/idx/bt/IdxBtOmp.hpp b/include/alpaka/idx/bt/IdxBtOmp.hpp index eb9930471eae..df5a96a87b9d 100644 --- a/include/alpaka/idx/bt/IdxBtOmp.hpp +++ b/include/alpaka/idx/bt/IdxBtOmp.hpp @@ -45,7 +45,7 @@ namespace alpaka static auto getIdx(bt::IdxBtOmp const& /* idx */, TWorkDiv const& workDiv) -> Vec { // We assume that the thread id is positive. - ALPAKA_ASSERT_OFFLOAD(::omp_get_thread_num() >= 0); + ALPAKA_ASSERT_ACC(::omp_get_thread_num() >= 0); // \TODO: Would it be faster to precompute the index and cache it inside an array? return mapIdx( Vec, TIdx>(static_cast(::omp_get_thread_num())), diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index 425d97a25859..51957ba79a27 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -119,8 +119,8 @@ namespace alpaka::warp::trait template static auto shfl(warp::WarpGenericSycl const& warp, T value, std::int32_t srcLane, std::int32_t width) { - ALPAKA_ASSERT_OFFLOAD(width > 0); - ALPAKA_ASSERT_OFFLOAD(srcLane >= 0); + ALPAKA_ASSERT_ACC(width > 0); + ALPAKA_ASSERT_ACC(srcLane >= 0); /* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each subdivision has the assumed index 0. The srcLane index is relative to the subdivisions.