From f05838ac0e88558630aeda50337d06e3d06188e8 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Wed, 28 Aug 2024 20:41:41 +0200
Subject: [PATCH 1/5] fix: set CMAKE_MODULE_PATH

---
 Tutorials/reduction/benchmark/CMakeLists.txt | 1 +
 Tutorials/reduction/test/CMakeLists.txt      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index dcde666a7..acf265ac2 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -141,6 +141,7 @@ foreach(VER RANGE 0 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
+    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index d235d669e..3dd046be8 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -162,6 +162,7 @@ foreach(VER RANGE 1 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
    )
+    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
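A note on why this patch is needed: find_package(HIP MODULE REQUIRED) can only succeed if CMake
can locate a FindHIP.cmake module, and that module ships with ROCm rather than with CMake itself,
so CMAKE_MODULE_PATH has to be extended before the find_package call. A minimal stand-alone sketch
of the pattern — the /opt/rocm default below is an assumption for illustration, not taken from the
patch:

    # Assumed default location of a ROCm installation; adjust for your system.
    set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root of the ROCm installation")
    # FindHIP.cmake is shipped by ROCm, not by CMake, so make it visible to find_package.
    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_ROOT}/cmake")
    find_package(HIP MODULE REQUIRED)
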
From 52412db3a4a922d81ec619ecac4aa9d88194e4e5 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Thu, 29 Aug 2024 11:45:11 +0200
Subject: [PATCH 2/5] fix: allow __host__/__device__ annotated lambdas for
 CUDA backend

---
 Tutorials/reduction/benchmark/CMakeLists.txt | 2 ++
 Tutorials/reduction/example/CMakeLists.txt   | 2 ++
 Tutorials/reduction/test/CMakeLists.txt      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index acf265ac2..2af2d4db1 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_benchmarks LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index dba24cb64..225dc8a59 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_examples LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index 3dd046be8..15b5d2d7a 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_tests LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
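Background for this patch: nvcc only accepts __host__/__device__ annotations on lambdas when
extended lambdas are enabled, so without --extended-lambda the tutorial's annotated lambdas fail
to compile on the CUDA backend (hipcc/amdclang++ accepts them without a flag). A self-contained
illustration of the kind of code the flag unlocks — this is a sketch, not code from the tutorial:

    #include <hip/hip_runtime.h>

    template<typename F>
    __global__ void transform(float* data, unsigned int n, F f)
    {
        const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n)
            data[i] = f(data[i]);
    }

    void square_all(float* data, unsigned int n)
    {
        // A __device__-annotated lambda: nvcc rejects this annotation unless the
        // translation unit is compiled with --extended-lambda.
        auto square = [] __device__(float x) { return x * x; };
        transform<<<dim3((n + 255) / 256), dim3(256)>>>(data, n, square);
    }
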
From d77603eb8565a9b06307386d87b0a41d9704c197 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Fri, 30 Aug 2024 09:22:11 +0200
Subject: [PATCH 3/5] fix: a __global__ function cannot be a member function

---
 Tutorials/reduction/include/Reduction/v1.hpp  |  67 ++++----
 Tutorials/reduction/include/Reduction/v10.hpp | 125 +++++++--------
 Tutorials/reduction/include/Reduction/v2.hpp  |  71 ++++-----
 Tutorials/reduction/include/Reduction/v3.hpp  |  66 ++++----
 Tutorials/reduction/include/Reduction/v4.hpp  |  67 ++++----
 Tutorials/reduction/include/Reduction/v5.hpp  |  95 ++++++-----
 Tutorials/reduction/include/Reduction/v6.hpp  |  86 +++++-----
 Tutorials/reduction/include/Reduction/v7.hpp  |  86 +++++-----
 Tutorials/reduction/include/Reduction/v8.hpp  | 104 ++++++------
 Tutorials/reduction/include/Reduction/v9.hpp  | 150 +++++++++---------
 10 files changed, 434 insertions(+), 483 deletions(-)

diff --git a/Tutorials/reduction/include/Reduction/v1.hpp b/Tutorials/reduction/include/Reduction/v1.hpp
index 189fb4dc5..420214a41 100644
--- a/Tutorials/reduction/include/Reduction/v1.hpp
+++ b/Tutorials/reduction/include/Reduction/v1.hpp
@@ -47,6 +47,32 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = 1; i < blockDim.x; i *= 2)
+    {
+        if(tid % (2 * i) == 0)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v1
 {
 public:
@@ -89,16 +115,11 @@ class v1
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               factor * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          factor * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +157,5 @@ class v1
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = 1; i < blockDim.x; i *= 2)
-        {
-            if(tid % (2 * i) == 0)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v10.hpp b/Tutorials/reduction/include/Reduction/v10.hpp
index 1559c2dd2..347dc9e59 100644
--- a/Tutorials/reduction/include/Reduction/v10.hpp
+++ b/Tutorials/reduction/include/Reduction/v10.hpp
@@ -49,6 +49,52 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
+                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;
+
+    // Read input from front buffer to local
+    T res = read_global_safe(gid);
+    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
+    for(uint32_t i = 0; i < num_global_passes; ++i)
+        res = op(res, read_global_safe((i + 1) * gsi + gid));
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v10
 {
 public:
@@ -97,21 +143,21 @@ class v10
             block_size,
             [&]<uint32_t BlockSize>() noexcept
            {
-                tmp::static_switch(warp_size,
-                                   [&]<uint32_t WarpSize>() noexcept
-                                   {
-                                       hipLaunchKernelGGL(
-                                           kernel<BlockSize, WarpSize>,
-                                           dim3(block_count),
-                                           dim3(BlockSize),
-                                           0,
-                                           0,
-                                           front,
-                                           back,
-                                           kernel_op,
-                                           zero_elem,
-                                           step_size);
-                                   });
+                tmp::static_switch(
+                    warp_size,
+                    [&]<uint32_t WarpSize>() noexcept
+                    {
+                        HIP_KERNEL_NAME(
+                            kernel<BlockSize, WarpSize>)<<<dim3(block_count),
+                                                           dim3(BlockSize),
+                                                           0,
+                                                           0>>>(
+                            front,
+                            back,
+                            kernel_op,
+                            zero_elem,
+                            step_size);
+                    });
             });
         };
 
@@ -158,54 +204,5 @@ class v10
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-        auto read_shared_safe
-            = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
-                       gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;
-
-        // Read input from front buffer to local
-        T res = read_global_safe(gid);
-        const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
-        for(uint32_t i = 0; i < num_global_passes; ++i)
-            res = op(res, read_global_safe((i + 1) * gsi + gid));
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v2.hpp b/Tutorials/reduction/include/Reduction/v2.hpp
index c44740031..ed64929f6 100644
--- a/Tutorials/reduction/include/Reduction/v2.hpp
+++ b/Tutorials/reduction/include/Reduction/v2.hpp
@@ -47,6 +47,34 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = 1; i < blockDim.x; i *= 2)
+    {
+        uint32_t index = 2 * i * tid;
+
+        if(index < blockDim.x)
+            shared[index] = op(shared[index], shared[index + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v2
 {
 public:
@@ -89,16 +117,11 @@ class v2
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               factor * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          factor * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,33 +159,5 @@ class v2
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = 1; i < blockDim.x; i *= 2)
-        {
-            uint32_t index = 2 * i * tid;
-
-            if(index < blockDim.x)
-                shared[index] = op(shared[index], shared[index + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v3.hpp b/Tutorials/reduction/include/Reduction/v3.hpp
index 41d116311..a1c947c20 100644
--- a/Tutorials/reduction/include/Reduction/v3.hpp
+++ b/Tutorials/reduction/include/Reduction/v3.hpp
@@ -48,6 +48,31 @@
 namespace reduction
 {
 template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+template<typename T, typename F>
 class v3
 {
 public:
@@ -89,16 +114,11 @@ class v3
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               block_size * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          block_size * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +156,5 @@ class v3
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v4.hpp b/Tutorials/reduction/include/Reduction/v4.hpp
index abf0107bd..78492ddca 100644
--- a/Tutorials/reduction/include/Reduction/v4.hpp
+++ b/Tutorials/reduction/include/Reduction/v4.hpp
@@ -47,6 +47,32 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v4
 {
 public:
@@ -89,16 +115,11 @@ class v4
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               block_size * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          block_size * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +157,5 @@ class v4
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v5.hpp b/Tutorials/reduction/include/Reduction/v5.hpp
index d54be3a24..1f41f6399 100644
--- a/Tutorials/reduction/include/Reduction/v5.hpp
+++ b/Tutorials/reduction/include/Reduction/v5.hpp
@@ -49,6 +49,39 @@
 namespace reduction
 {
+template<uint32_t WarpSize, typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+    // Warp reduction
+    tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+        });
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v5
 {
 public:
@@ -92,21 +125,19 @@ class v5
 
         auto kernel_dispatcher = [&](std::size_t step_size)
         {
-            tmp::static_switch(warp_size,
-                               [&]<uint32_t WarpSize>() noexcept
-                               {
-                                   hipLaunchKernelGGL(
-                                       kernel<WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(block_size),
-                                       block_size * sizeof(T),
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
-                               });
+            tmp::static_switch(
+                warp_size,
+                [&]<uint32_t WarpSize>() noexcept
+                {
+                    HIP_KERNEL_NAME(kernel<WarpSize>)<<<dim3(new_size(factor, step_size)),
+                                                        dim3(block_size),
+                                                        block_size * sizeof(T),
+                                                        hipStreamDefault>>>(front,
+                                                                            back,
+                                                                            kernel_op,
+                                                                            zero_elem,
+                                                                            step_size);
+                });
         };
 
         hipEvent_t start, end;
@@ -156,39 +187,5 @@ class v5
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t WarpSize>
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-        // Warp reduction
-        tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-            });
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v6.hpp b/Tutorials/reduction/include/Reduction/v6.hpp
index 8d59c8c94..fd7799b6c 100644
--- a/Tutorials/reduction/include/Reduction/v6.hpp
+++ b/Tutorials/reduction/include/Reduction/v6.hpp
@@ -49,6 +49,41 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    __shared__ T shared[BlockSize];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+            __syncthreads();
+        });
+    // Warp reduction
+    tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+        });
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v6
 {
 public:
@@ -100,16 +135,11 @@ class v6
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
         };
@@ -161,41 +191,5 @@ class v6
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        __shared__ T shared[BlockSize];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-                __syncthreads();
-            });
-        // Warp reduction
-        tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-            });
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v7.hpp b/Tutorials/reduction/include/Reduction/v7.hpp
index 5f3bb2003..2dc9e6243 100644
--- a/Tutorials/reduction/include/Reduction/v7.hpp
+++ b/Tutorials/reduction/include/Reduction/v7.hpp
@@ -49,6 +49,41 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    __shared__ T shared[BlockSize];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+            __syncthreads();
+        });
+    // Warp reduction
+    if(tid < WarpSize)
+    {
+        T res = op(shared[tid], shared[tid + WarpSize]);
+        tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+            [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+        // Write result from shared to back buffer
+        if(tid == 0)
+            back[bid] = res;
+    }
+}
+
 template<typename T, typename F>
 class v7
 {
 public:
@@ -100,16 +135,11 @@ class v7
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
         };
@@ -161,41 +191,5 @@ class v7
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
    }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        __shared__ T shared[BlockSize];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-                __syncthreads();
-            });
-        // Warp reduction
-        if(tid < WarpSize)
-        {
-            T res = op(shared[tid], shared[tid + WarpSize]);
-            tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-            // Write result from shared to back buffer
-            if(tid == 0)
-                back[bid] = res;
-        }
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v8.hpp b/Tutorials/reduction/include/Reduction/v8.hpp
index 2a2a9c972..eb28a16a5 100644
--- a/Tutorials/reduction/include/Reduction/v8.hpp
+++ b/Tutorials/reduction/include/Reduction/v8.hpp
@@ -49,6 +49,49 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid,
+                   wid = tid / WarpSize, lid = tid % WarpSize;
+
+    // Read input from front buffer to local
+    T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v8
 {
 public:
@@ -100,16 +143,11 @@ class v8
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
        };
@@ -161,51 +199,5 @@ class v8
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-        auto read_shared_safe
-            = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid,
-                       wid = tid / WarpSize, lid = tid % WarpSize;
-
-        // Read input from front buffer to local
-        T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v9.hpp b/Tutorials/reduction/include/Reduction/v9.hpp
index b83f67035..5fd402a57 100644
--- a/Tutorials/reduction/include/Reduction/v9.hpp
+++ b/Tutorials/reduction/include/Reduction/v9.hpp
@@ -50,6 +50,70 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, uint32_t ItemsPerThread, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const int32_t i)
+    {
+        return [&]<int32_t... I>(std::integer_sequence<int32_t, I...>)
+        {
+            if(i + ItemsPerThread < front_size)
+                return hip::static_array<T, ItemsPerThread>{front[i + I]...};
+            else
+                return hip::static_array<T, ItemsPerThread>{
+                    (i + I < front_size ? front[i + I] : zero_elem)...};
+        }(std::make_integer_sequence<int32_t, ItemsPerThread>());
+    };
+    auto read_shared_safe = [&](const int32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const int32_t tid = threadIdx.x, bid = blockIdx.x,
+                  gid = bid * (blockDim.x * ItemsPerThread) + tid * ItemsPerThread,
+                  wid = tid / WarpSize, lid = tid % WarpSize;
+
+    T res = [&]()
+    {
+        // Read input from front buffer to local
+        hip::static_array<T, ItemsPerThread> arr = read_global_safe(gid);
+
+        // Reduce ItemsPerThread to scalar
+        tmp::static_for<1, tmp::less_than<ItemsPerThread>, tmp::increment<1>>(
+            [&]<uint32_t I>() { get<0>(arr) = op(get<0>(arr), get<I>(arr)); });
+
+        return get<0>(arr);
+    }();
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v9
 {
 public:
@@ -105,16 +169,16 @@ class v9
                     items_per_thread,
                     [&]<uint32_t ItemsPerThread>() noexcept
                     {
-                        hipLaunchKernelGGL(kernel<BlockSize, WarpSize, ItemsPerThread>,
-                                           dim3(new_size(factor, step_size)),
-                                           dim3(BlockSize),
-                                           0,
-                                           hipStreamDefault,
-                                           front,
-                                           back,
-                                           kernel_op,
-                                           zero_elem,
-                                           step_size);
+                        HIP_KERNEL_NAME(
+                            kernel<BlockSize, WarpSize, ItemsPerThread>)<<<
+                            dim3(new_size(factor, step_size)),
+                            dim3(BlockSize),
+                            0,
+                            hipStreamDefault>>>(front,
+                                                back,
+                                                kernel_op,
+                                                zero_elem,
+                                                step_size);
                     });
                 });
             });
@@ -167,71 +231,5 @@ class v9
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize, uint32_t ItemsPerThread>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe = [&](const int32_t i)
-        {
-            return [&]<int32_t... I>(std::integer_sequence<int32_t, I...>)
-            {
-                if(i + ItemsPerThread < front_size)
-                    return hip::static_array<T, ItemsPerThread>{front[i + I]...};
-                else
-                    return hip::static_array<T, ItemsPerThread>{
-                        (i + I < front_size ? front[i + I] : zero_elem)...};
-            }(std::make_integer_sequence<int32_t, ItemsPerThread>());
-        };
-        auto read_shared_safe
-            = [&](const int32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const int32_t tid = threadIdx.x, bid = blockIdx.x,
-                      gid = bid * (blockDim.x * ItemsPerThread) + tid * ItemsPerThread,
-                      wid = tid / WarpSize, lid = tid % WarpSize;
-
-        T res = [&]()
-        {
-            // Read input from front buffer to local
-            hip::static_array<T, ItemsPerThread> arr = read_global_safe(gid);
-
-            // Reduce ItemsPerThread to scalar
-            tmp::static_for<1, tmp::less_than<ItemsPerThread>, tmp::increment<1>>(
-                [&]<uint32_t I>() { get<0>(arr) = op(get<0>(arr), get<I>(arr)); });
-
-            return get<0>(arr);
-        }();
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
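Background for this patch: both CUDA and HIP forbid marking a member function __global__ —
kernels must live at namespace scope — so each kernel above is hoisted out of its vN class and
explicitly picks up the template parameters the class used to provide; the launches switch from
hipLaunchKernelGGL to triple-chevron syntax at the same time. A minimal sketch of the shape of
the change (names here are hypothetical, not the tutorial's):

    #include <hip/hip_runtime.h>

    // Rejected by both nvcc and hipcc: __global__ cannot apply to a member,
    // not even a static one.
    // template<typename T>
    // class reducer
    // {
    //     __global__ static void kernel(T* data); // error
    // };

    // Accepted: hoist the kernel to namespace scope and forward the template
    // parameters the class used to supply implicitly.
    template<typename T>
    __global__ void kernel(T* data)
    {
        data[threadIdx.x] = T{};
    }

    template<typename T>
    class reducer
    {
    public:
        static void run(T* data) { kernel<T><<<dim3(1), dim3(64)>>>(data); }
    };
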
From f08ccc088572f5cb06080015082fa2aca6cb5962 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Fri, 30 Aug 2024 11:07:47 +0200
Subject: [PATCH 4/5] fix: do not build the reduction tutorial on Windows

---
 Tutorials/reduction/CMakeLists.txt           |  5 +++++
 Tutorials/reduction/benchmark/CMakeLists.txt | 17 ++++++++++++++++-
 Tutorials/reduction/example/CMakeLists.txt   | 17 ++++++++++++++++-
 Tutorials/reduction/test/CMakeLists.txt      | 17 ++++++++++++++++-
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/Tutorials/reduction/CMakeLists.txt b/Tutorials/reduction/CMakeLists.txt
index 3b9627546..b2aa87c51 100644
--- a/Tutorials/reduction/CMakeLists.txt
+++ b/Tutorials/reduction/CMakeLists.txt
@@ -31,10 +31,15 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
 else()
     cmake_minimum_required(VERSION 3.21)
+    if(WIN32)
+        message(STATUS "The reduction tutorial is not supported on Windows. Not building.")
+        return()
+    endif()
 endif()
 
 project(Reduction LANGUAGES CXX)
 
+
 file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin})
diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index 2af2d4db1..d13165766 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction benchmarks.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index 225dc8a59..f5bb26e0e 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction examples.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index 15b5d2d7a..d01419a47 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction tests.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
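The guard in this patch leans on two CMake behaviors: find_package(CUDA) — the classic FindCUDA
module — defines CUDA_FOUND and CUDA_VERSION, and return() ends processing of only the current
CMakeLists.txt, so sibling directories keep configuring. A condensed sketch of the same idea
(the 12.5 cutoff mirrors the patch; the message text and combined condition are illustrative):

    if(WIN32)
        find_package(CUDA) # classic module: sets CUDA_FOUND and CUDA_VERSION
        if(NOT CUDA_FOUND OR CUDA_VERSION VERSION_LESS_EQUAL "12.5")
            message(STATUS "Skipping this directory on Windows with CUDA ${CUDA_VERSION}.")
            return() # stops this CMakeLists.txt only; the rest of the build continues
        endif()
    endif()
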
From e7024276133b85acdec9f1ddd639df9d567d5beb Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Wed, 18 Sep 2024 16:31:23 +0200
Subject: [PATCH 5/5] Fixed CMake linting

---
 Tutorials/reduction/CMakeLists.txt           |  6 ++++--
 Tutorials/reduction/benchmark/CMakeLists.txt | 14 ++++++++++----
 Tutorials/reduction/example/CMakeLists.txt   | 10 ++++++++--
 Tutorials/reduction/test/CMakeLists.txt      | 12 +++++++++---
 4 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/Tutorials/reduction/CMakeLists.txt b/Tutorials/reduction/CMakeLists.txt
index b2aa87c51..4eb6b6245 100644
--- a/Tutorials/reduction/CMakeLists.txt
+++ b/Tutorials/reduction/CMakeLists.txt
@@ -32,14 +32,16 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
 else()
     cmake_minimum_required(VERSION 3.21)
     if(WIN32)
-        message(STATUS "The reduction tutorial is not supported on Windows. Not building.")
+        message(
+            STATUS
+            "The reduction tutorial is not supported on Windows. Not building."
+        )
         return()
     endif()
 endif()
 
 project(Reduction LANGUAGES CXX)
 
-
 file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin})
diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index d13165766..4cb4e8dc8 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -70,18 +70,24 @@ list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
-
+
     # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
     if(WIN32)
         find_package(CUDA)
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction benchmarks.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction benchmarks."
+            )
             return()
         endif()
     endif()
@@ -158,7 +164,7 @@ foreach(VER RANGE 0 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
-    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index f5bb26e0e..e7f6a8d4b 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -77,11 +77,17 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction examples.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction examples."
+            )
             return()
         endif()
     endif()
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index d01419a47..6c71c6207 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -77,11 +77,17 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction tests.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction tests."
+            )
             return()
         endif()
     endif()
@@ -179,7 +185,7 @@ foreach(VER RANGE 1 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
-    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
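The linting pass applies two mechanical rules throughout: command names in lowercase (set(), not
SET()), and calls that overflow the line-length limit broken out with one argument per line and
the closing parenthesis on its own line. Illustrative only — the repository's actual linter
configuration is not shown in this series:

    # Before linting:
    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
    message(WARNING "Some very long diagnostic text that overflows the configured line length.")

    # After linting:
    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
    message(
        WARNING
        "Some very long diagnostic text that overflows the configured line length."
    )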