Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Tutorials config and build #163

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Tutorials/reduction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
else()
cmake_minimum_required(VERSION 3.21)
if(WIN32)
message(
STATUS
"The reduction tutorial is not supported on Windows. Not building."
)
return()
endif()
endif()

project(Reduction LANGUAGES CXX)
Expand Down
26 changes: 25 additions & 1 deletion Tutorials/reduction/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ project(reduction_benchmarks LANGUAGES CXX)

if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
# Allow __device__ or __host__ annotated lambdas
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
else()
cmake_minimum_required(VERSION 3.21)
# Add -fPIE flag to compiler.
Expand Down Expand Up @@ -65,9 +67,30 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
set(include_dirs "../../Common")
list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")

# For examples targeting NVIDIA, include the HIP header directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
# For examples targeting NVIDIA, include the HIP header directory.
list(APPEND include_dirs "${ROCM_ROOT}/include")

# Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
if(WIN32)
find_package(CUDA)
if(CUDA_FOUND)
set(FAULTY_CUDA_VERSION "12.5")
if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
message(
WARNING
"CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks."
)
return()
endif()
else()
message(
STATUS
"CUDA Toolkit not found. Not building reduction benchmarks."
)
return()
endif()
endif()
endif()

# libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
Expand Down Expand Up @@ -141,6 +164,7 @@ foreach(VER RANGE 0 10)
${Sources}
PROPERTIES LANGUAGE ${GPU_RUNTIME}
)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
find_package(HIP MODULE REQUIRED)
target_include_directories(
${TargetName}
Expand Down
25 changes: 24 additions & 1 deletion Tutorials/reduction/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ project(reduction_examples LANGUAGES CXX)

if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
# Allow __device__ or __host__ annotated lambdas
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
else()
cmake_minimum_required(VERSION 3.21)
# Add -fPIE flag to compiler.
Expand Down Expand Up @@ -65,9 +67,30 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
set(include_dirs "../../Common")
list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")

# For examples targeting NVIDIA, include the HIP header directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
# For examples targeting NVIDIA, include the HIP header directory.
list(APPEND include_dirs "${ROCM_ROOT}/include")

# Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
if(WIN32)
find_package(CUDA)
if(CUDA_FOUND)
set(FAULTY_CUDA_VERSION "12.5")
if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
message(
WARNING
"CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples."
)
return()
endif()
else()
message(
STATUS
"CUDA Toolkit not found. Not building reduction examples."
)
return()
endif()
endif()
endif()

# libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
Expand Down
67 changes: 31 additions & 36 deletions Tutorials/reduction/include/Reduction/v1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,32 @@

namespace reduction
{
// Tree-style block reduction kernel (tutorial version 1, interleaved addressing).
//
// Each block stages blockDim.x elements of `front` into dynamic shared memory,
// combines them pairwise with `op`, and thread 0 writes the block's partial
// result to back[blockIdx.x]. Reads past `front_size` yield `zero_elem`
// (presumably the identity element of `op` — so padded lanes do not affect
// the result; confirm against the call site).
//
// NOTE(review): at stride i the active thread reads shared[tid + i]; this
// assumes blockDim.x is a power of two so tid + i stays inside the shared
// allocation sized at launch — TODO confirm with the launch parameters.
template<typename T, typename F>
__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // Dynamic shared memory; byte size is supplied by the kernel launch.
    extern __shared__ T shared[];

    // Bounds-guarded read from global memory: out-of-range -> zero_elem.
    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };

    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;

    // Read input from front buffer to shared
    shared[tid] = read_global_safe(gid);
    __syncthreads();

    // Shared reduction: stride doubles each step; threads whose id is a
    // multiple of 2*i fold in the partner element i slots to the right.
    for(uint32_t i = 1; i < blockDim.x; i *= 2)
    {
        if(tid % (2 * i) == 0)
            shared[tid] = op(shared[tid], shared[tid + i]);
        // Outside the divergent branch: the loop bound is uniform, so every
        // thread of the block reaches this barrier each iteration.
        __syncthreads();
    }

    // Write result from shared to back buffer
    if(tid == 0)
        back[bid] = shared[0];
}

template<typename T, typename F>
class v1
{
Expand Down Expand Up @@ -89,16 +115,11 @@ class v1
std::size_t curr = input.size();
while(curr > 1)
{
hipLaunchKernelGGL(kernel,
dim3(new_size(factor, curr)),
dim3(block_size),
factor * sizeof(T),
hipStreamDefault,
front,
back,
kernel_op,
zero_elem,
curr);
HIP_KERNEL_NAME(
kernel<T, F>)<<<dim3(new_size(factor, curr)),
dim3(block_size),
factor * sizeof(T),
hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
hip::check(hipGetLastError(), "hipKernelLaunchGGL");

curr = new_size(factor, curr);
Expand Down Expand Up @@ -136,31 +157,5 @@ class v1
{
return actual / factor + (actual % factor == 0 ? 0 : 1);
}

// Tree-style block reduction kernel (in-class variant; T, F come from the
// enclosing class template).
//
// Each block stages blockDim.x elements of `front` into dynamic shared
// memory, reduces them pairwise with `op`, and thread 0 writes the block's
// partial result to back[blockIdx.x]. Reads past `front_size` return
// `zero_elem` (presumably the identity of `op` — confirm at the call site).
//
// NOTE(review): shared[tid + i] assumes blockDim.x is a power of two so the
// partner index stays inside the launch-sized shared allocation — verify.
__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // Dynamic shared memory; byte size supplied at kernel launch.
    extern __shared__ T shared[];

    // Bounds-guarded global read: out-of-range indices yield zero_elem.
    auto read_global_safe
        = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };

    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;

    // Read input from front buffer to shared
    shared[tid] = read_global_safe(gid);
    __syncthreads();

    // Shared reduction: doubling stride; at step i, threads with
    // tid % (2*i) == 0 fold in the element i slots away.
    for(uint32_t i = 1; i < blockDim.x; i *= 2)
    {
        if(tid % (2 * i) == 0)
            shared[tid] = op(shared[tid], shared[tid + i]);
        // Uniform loop bound: all threads hit this barrier every iteration.
        __syncthreads();
    }

    // Write result from shared to back buffer
    if(tid == 0)
        back[bid] = shared[0];
}
};
} // namespace reduction
125 changes: 61 additions & 64 deletions Tutorials/reduction/include/Reduction/v10.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,52 @@

namespace reduction
{
// Warp-shuffle reduction kernel (tutorial version 10).
//
// BlockSize and WarpSize are compile-time launch parameters; __launch_bounds__
// caps the block size so the compiler can budget registers accordingly.
// Each thread first accumulates multiple strided global elements into a
// register, warps then reduce via __shfl_down without shared-memory traffic,
// and warp leaders exchange partials through a small static shared array
// until thread 0 holds the block result, written to back[blockIdx.x].
// Reads past `front_size` (or past WarpCount in shared) yield `zero_elem`
// (presumably the identity of `op` — confirm at the call site).
template<typename T, typename F, uint32_t BlockSize, uint32_t WarpSize>
__global__ static __launch_bounds__(BlockSize) void kernel(
    T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // One shared slot per warp: only warp leaders communicate through shared.
    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
    __shared__ T shared[WarpCount];

    // Bounds-guarded reads: out-of-range indices yield the identity element.
    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };

    // gsi = total threads launched in the grid; wid/lid = warp id and lane id.
    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;

    // Read input from front buffer to local: each thread folds every gsi-th
    // element into a register before any inter-thread communication.
    T res = read_global_safe(gid);
    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
    for(uint32_t i = 0; i < num_global_passes; ++i)
        res = op(res, read_global_safe((i + 1) * gsi + gid));

    // Perform warp reductions and communicate results via shared.
    // tmp::static_for appears to unroll at compile time over a shrinking
    // active-warp count (WarpCount, ceil(WarpCount/WarpSize), ..., 1) —
    // defined in the project's tmp utilities; verify semantics there.
    tmp::static_for<WarpCount,
                    tmp::not_equal<0>,
                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
        [&]<uint32_t ActiveWarps>()
        {
            if(wid < ActiveWarps)
            {
                // Warp reduction: halve the shuffle distance each step until
                // lane 0 holds the warp's partial result.
                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
                    [&]<int Delta>() { res = op(res, __shfl_down(res, Delta)); });

                // Write warp result from local to shared
                if(lid == 0)
                    shared[wid] = res;
            }
            // Block-wide barrier outside the branch: all threads reach it.
            __syncthreads();

            // Read warp result from shared to local (zero_elem past WarpCount)
            res = read_shared_safe(tid);
        });

    // Write result from local to back buffer
    if(tid == 0)
        back[bid] = res;
}

template<typename T, typename F>
class v10
{
Expand Down Expand Up @@ -97,21 +143,21 @@ class v10
block_size,
[&]<int BlockSize>() noexcept
{
tmp::static_switch<std::array{32, 64}>(warp_size,
[&]<int WarpSize>() noexcept
{
hipLaunchKernelGGL(
kernel<BlockSize, WarpSize>,
dim3(block_count),
dim3(BlockSize),
0,
0,
front,
back,
kernel_op,
zero_elem,
step_size);
});
tmp::static_switch<std::array{32, 64}>(
warp_size,
[&]<int WarpSize>() noexcept
{
HIP_KERNEL_NAME(
kernel<T,
F,
BlockSize,
WarpSize>)<<<dim3(block_count), dim3(BlockSize), 0, 0>>>(
front,
back,
kernel_op,
zero_elem,
step_size);
});
});
};

Expand Down Expand Up @@ -158,54 +204,5 @@ class v10
{
return actual / factor + (actual % factor == 0 ? 0 : 1);
}

// Warp-shuffle reduction kernel (in-class variant; T, F come from the
// enclosing class template, BlockSize/WarpSize are compile-time parameters).
//
// Each thread accumulates strided global elements into a register, warps
// reduce via __shfl_down, and warp leaders exchange partials through a
// WarpCount-sized shared array until thread 0 holds the block result,
// written to back[blockIdx.x]. Out-of-range reads yield `zero_elem`
// (presumably the identity of `op` — confirm at the call site).
// __launch_bounds__ caps the block size for register budgeting.
template<uint32_t BlockSize, uint32_t WarpSize>
__global__ static __launch_bounds__(BlockSize) void kernel(
    T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // One shared slot per warp: only warp leaders write shared memory.
    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
    __shared__ T shared[WarpCount];

    // Bounds-guarded reads: out-of-range indices yield the identity element.
    auto read_global_safe
        = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
    auto read_shared_safe
        = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };

    // gsi = total threads in the grid; wid/lid = warp id and lane id.
    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;

    // Read input from front buffer to local: each thread folds every gsi-th
    // element into a register before any inter-thread communication.
    T res = read_global_safe(gid);
    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
    for(uint32_t i = 0; i < num_global_passes; ++i)
        res = op(res, read_global_safe((i + 1) * gsi + gid));

    // Perform warp reductions and communicate results via shared.
    // tmp::static_for appears to unroll at compile time over a shrinking
    // active-warp count (WarpCount down to 1) — semantics defined in the
    // project's tmp utilities; verify there.
    tmp::static_for<
        WarpCount,
        tmp::not_equal<0>,
        tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
        [&]<uint32_t ActiveWarps>()
        {
            if(wid < ActiveWarps)
            {
                // Warp reduction: shuffle distance halves each step until
                // lane 0 holds the warp's partial result.
                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
                    [&]<int Delta>() { res = op(res, __shfl_down(res, Delta)); });

                // Write warp result from local to shared
                if(lid == 0)
                    shared[wid] = res;
            }
            // Block-wide barrier outside the branch: all threads reach it.
            __syncthreads();

            // Read warp result from shared to local (zero_elem past WarpCount)
            res = read_shared_safe(tid);
        });

    // Write result from local to back buffer
    if(tid == 0)
        back[bid] = res;
}
};
} // namespace reduction
Loading
Loading