From f05838ac0e88558630aeda50337d06e3d06188e8 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Wed, 28 Aug 2024 20:41:41 +0200
Subject: [PATCH 1/5] fix: set CMAKE_MODULE_PATH

---
 Tutorials/reduction/benchmark/CMakeLists.txt | 1 +
 Tutorials/reduction/test/CMakeLists.txt      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index dcde666a7..acf265ac2 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -141,6 +141,7 @@ foreach(VER RANGE 0 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
+    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index d235d669e..3dd046be8 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -162,6 +162,7 @@ foreach(VER RANGE 1 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
    )
+    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
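A note on why this patch is needed: find_package(HIP MODULE REQUIRED) can only succeed if CMake
can locate a FindHIP.cmake module, and that module ships with ROCm rather than with CMake itself,
so CMAKE_MODULE_PATH has to be extended before the find_package call. A minimal stand-alone sketch
of the pattern — the /opt/rocm default below is an assumption for illustration, not taken from the
patch:

    # Assumed default location of a ROCm installation; adjust for your system.
    set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root of the ROCm installation")
    # FindHIP.cmake is shipped by ROCm, not by CMake, so make it visible to find_package.
    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${ROCM_ROOT}/cmake")
    find_package(HIP MODULE REQUIRED)
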
From 52412db3a4a922d81ec619ecac4aa9d88194e4e5 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Thu, 29 Aug 2024 11:45:11 +0200
Subject: [PATCH 2/5] fix: allow __host__/__device__ annotated lambdas for
 CUDA backend

---
 Tutorials/reduction/benchmark/CMakeLists.txt | 2 ++
 Tutorials/reduction/example/CMakeLists.txt   | 2 ++
 Tutorials/reduction/test/CMakeLists.txt      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index acf265ac2..2af2d4db1 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_benchmarks LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index dba24cb64..225dc8a59 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_examples LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index 3dd046be8..15b5d2d7a 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -24,6 +24,8 @@ project(reduction_tests LANGUAGES CXX)
 
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
+    # Allow __device__ or __host__ annotated lambdas
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 else()
     cmake_minimum_required(VERSION 3.21)
     # Add -fPIE flag to compiler.
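Background for this patch: nvcc only accepts __host__/__device__ annotations on lambdas when
extended lambdas are enabled, so without --extended-lambda the tutorial's annotated lambdas fail
to compile on the CUDA backend (hipcc/amdclang++ accepts them without a flag). A self-contained
illustration of the kind of code the flag unlocks — this is a sketch, not code from the tutorial:

    #include <hip/hip_runtime.h>

    template<typename F>
    __global__ void transform(float* data, unsigned int n, F f)
    {
        const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if(i < n)
            data[i] = f(data[i]);
    }

    void square_all(float* data, unsigned int n)
    {
        // A __device__-annotated lambda: nvcc rejects this annotation unless the
        // translation unit is compiled with --extended-lambda.
        auto square = [] __device__(float x) { return x * x; };
        transform<<<dim3((n + 255) / 256), dim3(256)>>>(data, n, square);
    }
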
From d77603eb8565a9b06307386d87b0a41d9704c197 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Fri, 30 Aug 2024 09:22:11 +0200
Subject: [PATCH 3/5] fix: a __global__ function cannot be a member function

---
 Tutorials/reduction/include/Reduction/v1.hpp  |  67 ++++----
 Tutorials/reduction/include/Reduction/v10.hpp | 125 +++++++--------
 Tutorials/reduction/include/Reduction/v2.hpp  |  71 ++++-----
 Tutorials/reduction/include/Reduction/v3.hpp  |  66 ++++----
 Tutorials/reduction/include/Reduction/v4.hpp  |  67 ++++----
 Tutorials/reduction/include/Reduction/v5.hpp  |  95 ++++++-----
 Tutorials/reduction/include/Reduction/v6.hpp  |  86 +++++-----
 Tutorials/reduction/include/Reduction/v7.hpp  |  86 +++++-----
 Tutorials/reduction/include/Reduction/v8.hpp  | 104 ++++++------
 Tutorials/reduction/include/Reduction/v9.hpp  | 150 +++++++++---------
 10 files changed, 434 insertions(+), 483 deletions(-)

diff --git a/Tutorials/reduction/include/Reduction/v1.hpp b/Tutorials/reduction/include/Reduction/v1.hpp
index 189fb4dc5..420214a41 100644
--- a/Tutorials/reduction/include/Reduction/v1.hpp
+++ b/Tutorials/reduction/include/Reduction/v1.hpp
@@ -47,6 +47,32 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = 1; i < blockDim.x; i *= 2)
+    {
+        if(tid % (2 * i) == 0)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v1
 {
 public:
@@ -89,16 +115,11 @@ class v1
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               factor * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          factor * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +157,5 @@ class v1
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = 1; i < blockDim.x; i *= 2)
-        {
-            if(tid % (2 * i) == 0)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v10.hpp b/Tutorials/reduction/include/Reduction/v10.hpp
index 1559c2dd2..347dc9e59 100644
--- a/Tutorials/reduction/include/Reduction/v10.hpp
+++ b/Tutorials/reduction/include/Reduction/v10.hpp
@@ -49,6 +49,52 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
+                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;
+
+    // Read input from front buffer to local
+    T res = read_global_safe(gid);
+    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
+    for(uint32_t i = 0; i < num_global_passes; ++i)
+        res = op(res, read_global_safe((i + 1) * gsi + gid));
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v10
 {
 public:
@@ -97,21 +143,21 @@ class v10
             block_size,
             [&]<uint32_t BlockSize>() noexcept
            {
-                tmp::static_switch(warp_size,
-                                   [&]<uint32_t WarpSize>() noexcept
-                                   {
-                                       hipLaunchKernelGGL(
-                                           kernel<BlockSize, WarpSize>,
-                                           dim3(block_count),
-                                           dim3(BlockSize),
-                                           0,
-                                           0,
-                                           front,
-                                           back,
-                                           kernel_op,
-                                           zero_elem,
-                                           step_size);
-                                   });
+                tmp::static_switch(
+                    warp_size,
+                    [&]<uint32_t WarpSize>() noexcept
+                    {
+                        HIP_KERNEL_NAME(
+                            kernel<BlockSize, WarpSize>)<<<dim3(block_count),
+                                                           dim3(BlockSize),
+                                                           0,
+                                                           0>>>(
+                            front,
+                            back,
+                            kernel_op,
+                            zero_elem,
+                            step_size);
+                    });
             });
         };
 
@@ -158,54 +204,5 @@ class v10
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-        auto read_shared_safe
-            = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
-                       gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;
-
-        // Read input from front buffer to local
-        T res = read_global_safe(gid);
-        const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
-        for(uint32_t i = 0; i < num_global_passes; ++i)
-            res = op(res, read_global_safe((i + 1) * gsi + gid));
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v2.hpp b/Tutorials/reduction/include/Reduction/v2.hpp
index c44740031..ed64929f6 100644
--- a/Tutorials/reduction/include/Reduction/v2.hpp
+++ b/Tutorials/reduction/include/Reduction/v2.hpp
@@ -47,6 +47,34 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = 1; i < blockDim.x; i *= 2)
+    {
+        uint32_t index = 2 * i * tid;
+
+        if(index < blockDim.x)
+            shared[index] = op(shared[index], shared[index + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v2
 {
 public:
@@ -89,16 +117,11 @@ class v2
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               factor * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          factor * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,33 +159,5 @@ class v2
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = 1; i < blockDim.x; i *= 2)
-        {
-            uint32_t index = 2 * i * tid;
-
-            if(index < blockDim.x)
-                shared[index] = op(shared[index], shared[index + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v3.hpp b/Tutorials/reduction/include/Reduction/v3.hpp
index 41d116311..a1c947c20 100644
--- a/Tutorials/reduction/include/Reduction/v3.hpp
+++ b/Tutorials/reduction/include/Reduction/v3.hpp
@@ -48,6 +48,31 @@
 namespace reduction
 {
 template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = read_global_safe(gid);
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+template<typename T, typename F>
 class v3
 {
 public:
@@ -89,16 +114,11 @@ class v3
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               block_size * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          block_size * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +156,5 @@ class v3
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = read_global_safe(gid);
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v4.hpp b/Tutorials/reduction/include/Reduction/v4.hpp
index abf0107bd..78492ddca 100644
--- a/Tutorials/reduction/include/Reduction/v4.hpp
+++ b/Tutorials/reduction/include/Reduction/v4.hpp
@@ -47,6 +47,32 @@
 namespace reduction
 {
+template<typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v4
 {
 public:
@@ -89,16 +115,11 @@ class v4
         std::size_t curr = input.size();
         while(curr > 1)
         {
-            hipLaunchKernelGGL(kernel,
-                               dim3(new_size(factor, curr)),
-                               dim3(block_size),
-                               block_size * sizeof(T),
-                               hipStreamDefault,
-                               front,
-                               back,
-                               kernel_op,
-                               zero_elem,
-                               curr);
+            HIP_KERNEL_NAME(
+                kernel)<<<dim3(new_size(factor, curr)),
+                          dim3(block_size),
+                          block_size * sizeof(T),
+                          hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
 
             hip::check(hipGetLastError(), "hipKernelLaunchGGL");
             curr = new_size(factor, curr);
@@ -136,31 +157,5 @@ class v4
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i != 0; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v5.hpp b/Tutorials/reduction/include/Reduction/v5.hpp
index d54be3a24..1f41f6399 100644
--- a/Tutorials/reduction/include/Reduction/v5.hpp
+++ b/Tutorials/reduction/include/Reduction/v5.hpp
@@ -49,6 +49,39 @@
 namespace reduction
 {
+template<uint32_t WarpSize, typename T, typename F>
+__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    extern __shared__ T shared[];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    for(uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
+    {
+        if(tid < i)
+            shared[tid] = op(shared[tid], shared[tid + i]);
+        __syncthreads();
+    }
+    // Warp reduction
+    tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+        });
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v5
 {
 public:
@@ -92,21 +125,19 @@ class v5
 
         auto kernel_dispatcher = [&](std::size_t step_size)
         {
-            tmp::static_switch(warp_size,
-                               [&]<uint32_t WarpSize>() noexcept
-                               {
-                                   hipLaunchKernelGGL(
-                                       kernel<WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(block_size),
-                                       block_size * sizeof(T),
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
-                               });
+            tmp::static_switch(
+                warp_size,
+                [&]<uint32_t WarpSize>() noexcept
+                {
+                    HIP_KERNEL_NAME(kernel<WarpSize>)<<<dim3(new_size(factor, step_size)),
+                                                        dim3(block_size),
+                                                        block_size * sizeof(T),
+                                                        hipStreamDefault>>>(front,
+                                                                            back,
+                                                                            kernel_op,
+                                                                            zero_elem,
+                                                                            step_size);
+                });
         };
 
         hipEvent_t start, end;
@@ -156,39 +187,5 @@ class v5
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t WarpSize>
-    __global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        extern __shared__ T shared[];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        for(uint32_t i = blockDim.x / 2; i > WarpSize; i /= 2)
-        {
-            if(tid < i)
-                shared[tid] = op(shared[tid], shared[tid + i]);
-            __syncthreads();
-        }
-        // Warp reduction
-        tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-            });
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v6.hpp b/Tutorials/reduction/include/Reduction/v6.hpp
index 8d59c8c94..fd7799b6c 100644
--- a/Tutorials/reduction/include/Reduction/v6.hpp
+++ b/Tutorials/reduction/include/Reduction/v6.hpp
@@ -49,6 +49,41 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    __shared__ T shared[BlockSize];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+            __syncthreads();
+        });
+    // Warp reduction
+    tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+        });
+
+    // Write result from shared to back buffer
+    if(tid == 0)
+        back[bid] = shared[0];
+}
+
 template<typename T, typename F>
 class v6
 {
 public:
@@ -100,16 +135,11 @@ class v6
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
         };
@@ -161,41 +191,5 @@ class v6
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        __shared__ T shared[BlockSize];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-                __syncthreads();
-            });
-        // Warp reduction
-        tmp::static_for<WarpSize, tmp::not_equal<0>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-            });
-
-        // Write result from shared to back buffer
-        if(tid == 0)
-            back[bid] = shared[0];
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v7.hpp b/Tutorials/reduction/include/Reduction/v7.hpp
index 5f3bb2003..2dc9e6243 100644
--- a/Tutorials/reduction/include/Reduction/v7.hpp
+++ b/Tutorials/reduction/include/Reduction/v7.hpp
@@ -49,6 +49,41 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    __shared__ T shared[BlockSize];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
+
+    // Read input from front buffer to shared
+    shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+    __syncthreads();
+
+    // Shared reduction
+    tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
+        [&]<uint32_t I>()
+        {
+            if(tid < I)
+                shared[tid] = op(shared[tid], shared[tid + I]);
+            __syncthreads();
+        });
+    // Warp reduction
+    if(tid < WarpSize)
+    {
+        T res = op(shared[tid], shared[tid + WarpSize]);
+        tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+            [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+        // Write result from shared to back buffer
+        if(tid == 0)
+            back[bid] = res;
+    }
+}
+
 template<typename T, typename F>
 class v7
 {
 public:
@@ -100,16 +135,11 @@ class v7
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
         };
@@ -161,41 +191,5 @@ class v7
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
    }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        __shared__ T shared[BlockSize];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid;
-
-        // Read input from front buffer to shared
-        shared[tid] = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-        __syncthreads();
-
-        // Shared reduction
-        tmp::static_for<BlockSize / 2, tmp::greater_than<WarpSize>, tmp::divide<2>>(
-            [&]<uint32_t I>()
-            {
-                if(tid < I)
-                    shared[tid] = op(shared[tid], shared[tid + I]);
-                __syncthreads();
-            });
-        // Warp reduction
-        if(tid < WarpSize)
-        {
-            T res = op(shared[tid], shared[tid + WarpSize]);
-            tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-            // Write result from shared to back buffer
-            if(tid == 0)
-                back[bid] = res;
-        }
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v8.hpp b/Tutorials/reduction/include/Reduction/v8.hpp
index 2a2a9c972..eb28a16a5 100644
--- a/Tutorials/reduction/include/Reduction/v8.hpp
+++ b/Tutorials/reduction/include/Reduction/v8.hpp
@@ -49,6 +49,49 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
+    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid,
+                   wid = tid / WarpSize, lid = tid % WarpSize;
+
+    // Read input from front buffer to local
+    T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v8
 {
 public:
@@ -100,16 +143,11 @@ class v8
                 warp_size,
                 [&]<uint32_t WarpSize>() noexcept
                 {
-                    hipLaunchKernelGGL(kernel<BlockSize, WarpSize>,
-                                       dim3(new_size(factor, step_size)),
-                                       dim3(BlockSize),
-                                       0,
-                                       hipStreamDefault,
-                                       front,
-                                       back,
-                                       kernel_op,
-                                       zero_elem,
-                                       step_size);
+                    HIP_KERNEL_NAME(kernel<BlockSize, WarpSize>)<<<
+                        dim3(new_size(factor, step_size)),
+                        dim3(BlockSize),
+                        0,
+                        hipStreamDefault>>>(front, back, kernel_op, zero_elem, step_size);
                 });
             });
        };
@@ -161,51 +199,5 @@ class v8
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe
-            = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
-        auto read_shared_safe
-            = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * (blockDim.x * 2) + tid,
-                       wid = tid / WarpSize, lid = tid % WarpSize;
-
-        // Read input from front buffer to local
-        T res = op(read_global_safe(gid), read_global_safe(gid + blockDim.x));
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
diff --git a/Tutorials/reduction/include/Reduction/v9.hpp b/Tutorials/reduction/include/Reduction/v9.hpp
index b83f67035..5fd402a57 100644
--- a/Tutorials/reduction/include/Reduction/v9.hpp
+++ b/Tutorials/reduction/include/Reduction/v9.hpp
@@ -50,6 +50,70 @@
 namespace reduction
 {
+template<uint32_t BlockSize, uint32_t WarpSize, uint32_t ItemsPerThread, typename T, typename F>
+__global__ static __launch_bounds__(BlockSize) void kernel(
+    T* front, T* back, F op, T zero_elem, uint32_t front_size)
+{
+    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
+
+    __shared__ T shared[WarpCount];
+
+    auto read_global_safe = [&](const int32_t i)
+    {
+        return [&]<int32_t... I>(std::integer_sequence<int32_t, I...>)
+        {
+            if(i + ItemsPerThread < front_size)
+                return hip::static_array<T, ItemsPerThread>{front[i + I]...};
+            else
+                return hip::static_array<T, ItemsPerThread>{
+                    (i + I < front_size ? front[i + I] : zero_elem)...};
+        }(std::make_integer_sequence<int32_t, ItemsPerThread>());
+    };
+    auto read_shared_safe = [&](const int32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
+
+    const int32_t tid = threadIdx.x, bid = blockIdx.x,
+                  gid = bid * (blockDim.x * ItemsPerThread) + tid * ItemsPerThread,
+                  wid = tid / WarpSize, lid = tid % WarpSize;
+
+    T res = [&]()
+    {
+        // Read input from front buffer to local
+        hip::static_array<T, ItemsPerThread> arr = read_global_safe(gid);
+
+        // Reduce ItemsPerThread to scalar
+        tmp::static_for<1, tmp::less_than<ItemsPerThread>, tmp::increment<1>>(
+            [&]<uint32_t I>() { get<0>(arr) = op(get<0>(arr), get<I>(arr)); });
+
+        return get<0>(arr);
+    }();
+
+    // Perform warp reductions and communicate results via shared
+    tmp::static_for<WarpCount,
+                    tmp::not_equal<0>,
+                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
+        [&]<uint32_t ActiveWarps>()
+        {
+            if(wid < ActiveWarps)
+            {
+                // Warp reduction
+                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
+                    [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
+
+                // Write warp result from local to shared
+                if(lid == 0)
+                    shared[wid] = res;
+            }
+            __syncthreads();
+
+            // Read warp result from shared to local
+            res = read_shared_safe(tid);
+        });
+
+    // Write result from local to back buffer
+    if(tid == 0)
+        back[bid] = res;
+}
+
 template<typename T, typename F>
 class v9
 {
 public:
@@ -105,16 +169,16 @@ class v9
                     items_per_thread,
                     [&]<uint32_t ItemsPerThread>() noexcept
                     {
-                        hipLaunchKernelGGL(kernel<BlockSize, WarpSize, ItemsPerThread>,
-                                           dim3(new_size(factor, step_size)),
-                                           dim3(BlockSize),
-                                           0,
-                                           hipStreamDefault,
-                                           front,
-                                           back,
-                                           kernel_op,
-                                           zero_elem,
-                                           step_size);
+                        HIP_KERNEL_NAME(
+                            kernel<BlockSize, WarpSize, ItemsPerThread>)<<<
+                            dim3(new_size(factor, step_size)),
+                            dim3(BlockSize),
+                            0,
+                            hipStreamDefault>>>(front,
+                                                back,
+                                                kernel_op,
+                                                zero_elem,
+                                                step_size);
                     });
                 });
             });
@@ -167,71 +231,5 @@ class v9
     {
         return actual / factor + (actual % factor == 0 ? 0 : 1);
     }
-
-    template<uint32_t BlockSize, uint32_t WarpSize, uint32_t ItemsPerThread>
-    __global__ static __launch_bounds__(BlockSize) void kernel(
-        T* front, T* back, F op, T zero_elem, uint32_t front_size)
-    {
-        static constexpr uint32_t WarpCount = BlockSize / WarpSize;
-
-        __shared__ T shared[WarpCount];
-
-        auto read_global_safe = [&](const int32_t i)
-        {
-            return [&]<int32_t... I>(std::integer_sequence<int32_t, I...>)
-            {
-                if(i + ItemsPerThread < front_size)
-                    return hip::static_array<T, ItemsPerThread>{front[i + I]...};
-                else
-                    return hip::static_array<T, ItemsPerThread>{
-                        (i + I < front_size ? front[i + I] : zero_elem)...};
-            }(std::make_integer_sequence<int32_t, ItemsPerThread>());
-        };
-        auto read_shared_safe
-            = [&](const int32_t i) { return i < WarpCount ? shared[i] : zero_elem; };
-
-        const int32_t tid = threadIdx.x, bid = blockIdx.x,
-                      gid = bid * (blockDim.x * ItemsPerThread) + tid * ItemsPerThread,
-                      wid = tid / WarpSize, lid = tid % WarpSize;
-
-        T res = [&]()
-        {
-            // Read input from front buffer to local
-            hip::static_array<T, ItemsPerThread> arr = read_global_safe(gid);
-
-            // Reduce ItemsPerThread to scalar
-            tmp::static_for<1, tmp::less_than<ItemsPerThread>, tmp::increment<1>>(
-                [&]<uint32_t I>() { get<0>(arr) = op(get<0>(arr), get<I>(arr)); });
-
-            return get<0>(arr);
-        }();
-
-        // Perform warp reductions and communicate results via shared
-        tmp::static_for<
-            WarpCount,
-            tmp::not_equal<0>,
-            tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
-            [&]<uint32_t ActiveWarps>()
-            {
-                if(wid < ActiveWarps)
-                {
-                    // Warp reduction
-                    tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
-                        [&]<uint32_t Delta>() { res = op(res, __shfl_down(res, Delta)); });
-
-                    // Write warp result from local to shared
-                    if(lid == 0)
-                        shared[wid] = res;
-                }
-                __syncthreads();
-
-                // Read warp result from shared to local
-                res = read_shared_safe(tid);
-            });
-
-        // Write result from local to back buffer
-        if(tid == 0)
-            back[bid] = res;
-    }
 };
 } // namespace reduction
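Background for this patch: both CUDA and HIP forbid marking a member function __global__ —
kernels must live at namespace scope — so each kernel above is hoisted out of its vN class and
explicitly picks up the template parameters the class used to provide; the launches switch from
hipLaunchKernelGGL to triple-chevron syntax at the same time. A minimal sketch of the shape of
the change (names here are hypothetical, not the tutorial's):

    #include <hip/hip_runtime.h>

    // Rejected by both nvcc and hipcc: __global__ cannot apply to a member,
    // not even a static one.
    // template<typename T>
    // class reducer
    // {
    //     __global__ static void kernel(T* data); // error
    // };

    // Accepted: hoist the kernel to namespace scope and forward the template
    // parameters the class used to supply implicitly.
    template<typename T>
    __global__ void kernel(T* data)
    {
        data[threadIdx.x] = T{};
    }

    template<typename T>
    class reducer
    {
    public:
        static void run(T* data) { kernel<T><<<dim3(1), dim3(64)>>>(data); }
    };
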
From f08ccc088572f5cb06080015082fa2aca6cb5962 Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Fri, 30 Aug 2024 11:07:47 +0200
Subject: [PATCH 4/5] fix: do not build the reduction tutorial on Windows

---
 Tutorials/reduction/CMakeLists.txt           |  5 +++++
 Tutorials/reduction/benchmark/CMakeLists.txt | 17 ++++++++++++++++-
 Tutorials/reduction/example/CMakeLists.txt   | 17 ++++++++++++++++-
 Tutorials/reduction/test/CMakeLists.txt      | 17 ++++++++++++++++-
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/Tutorials/reduction/CMakeLists.txt b/Tutorials/reduction/CMakeLists.txt
index 3b9627546..b2aa87c51 100644
--- a/Tutorials/reduction/CMakeLists.txt
+++ b/Tutorials/reduction/CMakeLists.txt
@@ -31,10 +31,15 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
     cmake_minimum_required(VERSION 3.25.2)
 else()
     cmake_minimum_required(VERSION 3.21)
+    if(WIN32)
+        message(STATUS "The reduction tutorial is not supported on Windows. Not building.")
+        return()
+    endif()
 endif()
 
 project(Reduction LANGUAGES CXX)
 
+
 file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin})
diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index 2af2d4db1..d13165766 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction benchmarks.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index 225dc8a59..f5bb26e0e 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction examples.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index 15b5d2d7a..d01419a47 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -67,9 +67,24 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 set(include_dirs "../../Common")
 list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 
-# For examples targeting NVIDIA, include the HIP header directory.
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
+    # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
+
+    # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
+    if(WIN32)
+        find_package(CUDA)
+        if(CUDA_FOUND)
+            set(FAULTY_CUDA_VERSION "12.5")
+            if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
+                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests.")
+                return()
+            endif()
+        else()
+            message(STATUS "CUDA Toolkit not found. Not building reduction tests.")
+            return()
+        endif()
+    endif()
 endif()
 
 # libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
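The guard in this patch leans on two CMake behaviors: find_package(CUDA) — the classic FindCUDA
module — defines CUDA_FOUND and CUDA_VERSION, and return() ends processing of only the current
CMakeLists.txt, so sibling directories keep configuring. A condensed sketch of the same idea
(the 12.5 cutoff mirrors the patch; the message text and combined condition are illustrative):

    if(WIN32)
        find_package(CUDA) # classic module: sets CUDA_FOUND and CUDA_VERSION
        if(NOT CUDA_FOUND OR CUDA_VERSION VERSION_LESS_EQUAL "12.5")
            message(STATUS "Skipping this directory on Windows with CUDA ${CUDA_VERSION}.")
            return() # stops this CMakeLists.txt only; the rest of the build continues
        endif()
    endif()
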
From e7024276133b85acdec9f1ddd639df9d567d5beb Mon Sep 17 00:00:00 2001
From: Beatriz Navidad Vilches
Date: Wed, 18 Sep 2024 16:31:23 +0200
Subject: [PATCH 5/5] Fixed CMake linting

---
 Tutorials/reduction/CMakeLists.txt           |  6 ++++--
 Tutorials/reduction/benchmark/CMakeLists.txt | 14 ++++++++++----
 Tutorials/reduction/example/CMakeLists.txt   | 10 ++++++++--
 Tutorials/reduction/test/CMakeLists.txt      | 12 +++++++++---
 4 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/Tutorials/reduction/CMakeLists.txt b/Tutorials/reduction/CMakeLists.txt
index b2aa87c51..4eb6b6245 100644
--- a/Tutorials/reduction/CMakeLists.txt
+++ b/Tutorials/reduction/CMakeLists.txt
@@ -32,14 +32,16 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
 else()
     cmake_minimum_required(VERSION 3.21)
     if(WIN32)
-        message(STATUS "The reduction tutorial is not supported on Windows. Not building.")
+        message(
+            STATUS
+            "The reduction tutorial is not supported on Windows. Not building."
+        )
         return()
     endif()
 endif()
 
 project(Reduction LANGUAGES CXX)
 
-
 file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin})
diff --git a/Tutorials/reduction/benchmark/CMakeLists.txt b/Tutorials/reduction/benchmark/CMakeLists.txt
index d13165766..4cb4e8dc8 100644
--- a/Tutorials/reduction/benchmark/CMakeLists.txt
+++ b/Tutorials/reduction/benchmark/CMakeLists.txt
@@ -70,18 +70,24 @@ list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")
 if("${GPU_RUNTIME}" STREQUAL "CUDA")
     # For examples targeting NVIDIA, include the HIP header directory.
     list(APPEND include_dirs "${ROCM_ROOT}/include")
-
+
     # Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
     if(WIN32)
         find_package(CUDA)
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction benchmarks.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction benchmarks."
+            )
             return()
         endif()
     endif()
@@ -158,7 +164,7 @@ foreach(VER RANGE 0 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
-    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
diff --git a/Tutorials/reduction/example/CMakeLists.txt b/Tutorials/reduction/example/CMakeLists.txt
index f5bb26e0e..e7f6a8d4b 100644
--- a/Tutorials/reduction/example/CMakeLists.txt
+++ b/Tutorials/reduction/example/CMakeLists.txt
@@ -77,11 +77,17 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction examples.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction examples."
+            )
             return()
         endif()
     endif()
diff --git a/Tutorials/reduction/test/CMakeLists.txt b/Tutorials/reduction/test/CMakeLists.txt
index d01419a47..6c71c6207 100644
--- a/Tutorials/reduction/test/CMakeLists.txt
+++ b/Tutorials/reduction/test/CMakeLists.txt
@@ -77,11 +77,17 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
         if(CUDA_FOUND)
             set(FAULTY_CUDA_VERSION "12.5")
             if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
-                message(WARNING "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests.")
+                message(
+                    WARNING
+                    "CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction tests."
+                )
                 return()
             endif()
         else()
-            message(STATUS "CUDA Toolkit not found. Not building reduction tests.")
+            message(
+                STATUS
+                "CUDA Toolkit not found. Not building reduction tests."
+            )
             return()
         endif()
     endif()
@@ -179,7 +185,7 @@ foreach(VER RANGE 1 10)
         ${Sources}
         PROPERTIES LANGUAGE ${GPU_RUNTIME}
     )
-    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
     find_package(HIP MODULE REQUIRED)
     target_include_directories(
         ${TargetName}
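The linting pass applies two mechanical rules throughout: command names in lowercase (set(), not
SET()), and calls that overflow the line-length limit broken out with one argument per line and
the closing parenthesis on its own line. Illustrative only — the repository's actual linter
configuration is not shown in this series:

    # Before linting:
    SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
    message(WARNING "Some very long diagnostic text that overflows the configured line length.")

    # After linting:
    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
    message(
        WARNING
        "Some very long diagnostic text that overflows the configured line length."
    )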