Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Tutorials config and build #163

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Tutorials/reduction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
else()
cmake_minimum_required(VERSION 3.21)
if(WIN32)
message(
STATUS
"The reduction tutorial is not supported on Windows. Not building."
)
return()
endif()
endif()

project(Reduction LANGUAGES CXX)
Expand Down
26 changes: 25 additions & 1 deletion Tutorials/reduction/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ project(reduction_benchmarks LANGUAGES CXX)

if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
# Allow __device__ or __host__ annotated lambdas
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
else()
cmake_minimum_required(VERSION 3.21)
# Add -fPIE flag to compiler.
Expand Down Expand Up @@ -65,9 +67,30 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
set(include_dirs "../../Common")
list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")

# For examples targeting NVIDIA, include the HIP header directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
# For examples targeting NVIDIA, include the HIP header directory.
list(APPEND include_dirs "${ROCM_ROOT}/include")

# Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
if(WIN32)
find_package(CUDA)
if(CUDA_FOUND)
set(FAULTY_CUDA_VERSION "12.5")
if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
message(
WARNING
"CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction benchmarks."
)
return()
endif()
else()
message(
STATUS
"CUDA Toolkit not found. Not building reduction benchmarks."
)
return()
endif()
endif()
endif()

# libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
Expand Down Expand Up @@ -141,6 +164,7 @@ foreach(VER RANGE 0 10)
${Sources}
PROPERTIES LANGUAGE ${GPU_RUNTIME}
)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${ROCM_ROOT}/cmake)
find_package(HIP MODULE REQUIRED)
target_include_directories(
${TargetName}
Expand Down
25 changes: 24 additions & 1 deletion Tutorials/reduction/example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ project(reduction_examples LANGUAGES CXX)

if("${GPU_RUNTIME}" STREQUAL "CUDA")
cmake_minimum_required(VERSION 3.25.2)
# Allow __device__ or __host__ annotated lambdas
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
else()
cmake_minimum_required(VERSION 3.21)
# Add -fPIE flag to compiler.
Expand Down Expand Up @@ -65,9 +67,30 @@ list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
set(include_dirs "../../Common")
list(APPEND include_dirs "${PROJECT_SOURCE_DIR}")

# For examples targeting NVIDIA, include the HIP header directory.
if("${GPU_RUNTIME}" STREQUAL "CUDA")
# For examples targeting NVIDIA, include the HIP header directory.
list(APPEND include_dirs "${ROCM_ROOT}/include")

# Some CUDA versions have issues when compiling for C++20 on Windows, check if we are using those
if(WIN32)
find_package(CUDA)
if(CUDA_FOUND)
set(FAULTY_CUDA_VERSION "12.5")
if(CUDA_VERSION VERSION_LESS_EQUAL ${FAULTY_CUDA_VERSION})
message(
WARNING
"CUDA version ${CUDA_VERSION} has issues when compiling for C++20. Not building reduction examples."
)
return()
endif()
else()
message(
STATUS
"CUDA Toolkit not found. Not building reduction examples."
)
return()
endif()
endif()
endif()

# libstdc++ Parallel STL on Ubuntu 20.04 requires explicit linking to TBB
Expand Down
67 changes: 31 additions & 36 deletions Tutorials/reduction/include/Reduction/v1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,32 @@

namespace reduction
{
// Tree-style block reduction kernel (tutorial version 1, interleaved addressing).
//
// Each block stages blockDim.x elements of `front` into dynamic shared memory,
// combines them pairwise with `op`, and thread 0 writes the block's partial
// result to back[blockIdx.x]. Reads past `front_size` yield `zero_elem`
// (presumably the identity element of `op` — so padded lanes do not affect
// the result; confirm against the call site).
//
// NOTE(review): at stride i the active thread reads shared[tid + i]; this
// assumes blockDim.x is a power of two so tid + i stays inside the shared
// allocation sized at launch — TODO confirm with the launch parameters.
template<typename T, typename F>
__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // Dynamic shared memory; byte size is supplied by the kernel launch.
    extern __shared__ T shared[];

    // Bounds-guarded read from global memory: out-of-range -> zero_elem.
    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };

    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;

    // Read input from front buffer to shared
    shared[tid] = read_global_safe(gid);
    __syncthreads();

    // Shared reduction: stride doubles each step; threads whose id is a
    // multiple of 2*i fold in the partner element i slots to the right.
    for(uint32_t i = 1; i < blockDim.x; i *= 2)
    {
        if(tid % (2 * i) == 0)
            shared[tid] = op(shared[tid], shared[tid + i]);
        // Outside the divergent branch: the loop bound is uniform, so every
        // thread of the block reaches this barrier each iteration.
        __syncthreads();
    }

    // Write result from shared to back buffer
    if(tid == 0)
        back[bid] = shared[0];
}

template<typename T, typename F>
class v1
{
Expand Down Expand Up @@ -89,16 +115,11 @@ class v1
std::size_t curr = input.size();
while(curr > 1)
{
hipLaunchKernelGGL(kernel,
dim3(new_size(factor, curr)),
dim3(block_size),
factor * sizeof(T),
hipStreamDefault,
front,
back,
kernel_op,
zero_elem,
curr);
HIP_KERNEL_NAME(
kernel<T, F>)<<<dim3(new_size(factor, curr)),
dim3(block_size),
factor * sizeof(T),
hipStreamDefault>>>(front, back, kernel_op, zero_elem, curr);
hip::check(hipGetLastError(), "hipKernelLaunchGGL");

curr = new_size(factor, curr);
Expand Down Expand Up @@ -136,31 +157,5 @@ class v1
{
return actual / factor + (actual % factor == 0 ? 0 : 1);
}

// Tree-style block reduction kernel (in-class variant; T, F come from the
// enclosing class template).
//
// Each block stages blockDim.x elements of `front` into dynamic shared
// memory, reduces them pairwise with `op`, and thread 0 writes the block's
// partial result to back[blockIdx.x]. Reads past `front_size` return
// `zero_elem` (presumably the identity of `op` — confirm at the call site).
//
// NOTE(review): shared[tid + i] assumes blockDim.x is a power of two so the
// partner index stays inside the launch-sized shared allocation — verify.
__global__ static void kernel(T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // Dynamic shared memory; byte size supplied at kernel launch.
    extern __shared__ T shared[];

    // Bounds-guarded global read: out-of-range indices yield zero_elem.
    auto read_global_safe
        = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };

    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid;

    // Read input from front buffer to shared
    shared[tid] = read_global_safe(gid);
    __syncthreads();

    // Shared reduction: doubling stride; at step i, threads with
    // tid % (2*i) == 0 fold in the element i slots away.
    for(uint32_t i = 1; i < blockDim.x; i *= 2)
    {
        if(tid % (2 * i) == 0)
            shared[tid] = op(shared[tid], shared[tid + i]);
        // Uniform loop bound: all threads hit this barrier every iteration.
        __syncthreads();
    }

    // Write result from shared to back buffer
    if(tid == 0)
        back[bid] = shared[0];
}
};
} // namespace reduction
125 changes: 61 additions & 64 deletions Tutorials/reduction/include/Reduction/v10.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,52 @@

namespace reduction
{
// Warp-shuffle reduction kernel (tutorial version 10).
//
// BlockSize and WarpSize are compile-time launch parameters; __launch_bounds__
// caps the block size so the compiler can budget registers accordingly.
// Each thread first accumulates multiple strided global elements into a
// register, warps then reduce via __shfl_down without shared-memory traffic,
// and warp leaders exchange partials through a small static shared array
// until thread 0 holds the block result, written to back[blockIdx.x].
// Reads past `front_size` (or past WarpCount in shared) yield `zero_elem`
// (presumably the identity of `op` — confirm at the call site).
template<typename T, typename F, uint32_t BlockSize, uint32_t WarpSize>
__global__ static __launch_bounds__(BlockSize) void kernel(
    T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // One shared slot per warp: only warp leaders communicate through shared.
    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
    __shared__ T shared[WarpCount];

    // Bounds-guarded reads: out-of-range indices yield the identity element.
    auto read_global_safe = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
    auto read_shared_safe = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };

    // gsi = total threads launched in the grid; wid/lid = warp id and lane id.
    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;

    // Read input from front buffer to local: each thread folds every gsi-th
    // element into a register before any inter-thread communication.
    T res = read_global_safe(gid);
    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
    for(uint32_t i = 0; i < num_global_passes; ++i)
        res = op(res, read_global_safe((i + 1) * gsi + gid));

    // Perform warp reductions and communicate results via shared.
    // tmp::static_for appears to unroll at compile time over a shrinking
    // active-warp count (WarpCount, ceil(WarpCount/WarpSize), ..., 1) —
    // defined in the project's tmp utilities; verify semantics there.
    tmp::static_for<WarpCount,
                    tmp::not_equal<0>,
                    tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
        [&]<uint32_t ActiveWarps>()
        {
            if(wid < ActiveWarps)
            {
                // Warp reduction: halve the shuffle distance each step until
                // lane 0 holds the warp's partial result.
                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
                    [&]<int Delta>() { res = op(res, __shfl_down(res, Delta)); });

                // Write warp result from local to shared
                if(lid == 0)
                    shared[wid] = res;
            }
            // Block-wide barrier outside the branch: all threads reach it.
            __syncthreads();

            // Read warp result from shared to local (zero_elem past WarpCount)
            res = read_shared_safe(tid);
        });

    // Write result from local to back buffer
    if(tid == 0)
        back[bid] = res;
}

template<typename T, typename F>
class v10
{
Expand Down Expand Up @@ -97,21 +143,21 @@ class v10
block_size,
[&]<int BlockSize>() noexcept
{
tmp::static_switch<std::array{32, 64}>(warp_size,
[&]<int WarpSize>() noexcept
{
hipLaunchKernelGGL(
kernel<BlockSize, WarpSize>,
dim3(block_count),
dim3(BlockSize),
0,
0,
front,
back,
kernel_op,
zero_elem,
step_size);
});
tmp::static_switch<std::array{32, 64}>(
warp_size,
[&]<int WarpSize>() noexcept
{
HIP_KERNEL_NAME(
kernel<T,
F,
BlockSize,
WarpSize>)<<<dim3(block_count), dim3(BlockSize), 0, 0>>>(
front,
back,
kernel_op,
zero_elem,
step_size);
});
});
};

Expand Down Expand Up @@ -158,54 +204,5 @@ class v10
{
return actual / factor + (actual % factor == 0 ? 0 : 1);
}

// Warp-shuffle reduction kernel (in-class variant; T, F come from the
// enclosing class template, BlockSize/WarpSize are compile-time parameters).
//
// Each thread accumulates strided global elements into a register, warps
// reduce via __shfl_down, and warp leaders exchange partials through a
// WarpCount-sized shared array until thread 0 holds the block result,
// written to back[blockIdx.x]. Out-of-range reads yield `zero_elem`
// (presumably the identity of `op` — confirm at the call site).
// __launch_bounds__ caps the block size for register budgeting.
template<uint32_t BlockSize, uint32_t WarpSize>
__global__ static __launch_bounds__(BlockSize) void kernel(
    T* front, T* back, F op, T zero_elem, uint32_t front_size)
{
    // One shared slot per warp: only warp leaders write shared memory.
    static constexpr uint32_t WarpCount = BlockSize / WarpSize;
    __shared__ T shared[WarpCount];

    // Bounds-guarded reads: out-of-range indices yield the identity element.
    auto read_global_safe
        = [&](const uint32_t i) { return i < front_size ? front[i] : zero_elem; };
    auto read_shared_safe
        = [&](const uint32_t i) { return i < WarpCount ? shared[i] : zero_elem; };

    // gsi = total threads in the grid; wid/lid = warp id and lane id.
    const uint32_t tid = threadIdx.x, bid = blockIdx.x, gid = bid * blockDim.x + tid,
                   gsi = gridDim.x * blockDim.x, wid = tid / WarpSize, lid = tid % WarpSize;

    // Read input from front buffer to local: each thread folds every gsi-th
    // element into a register before any inter-thread communication.
    T res = read_global_safe(gid);
    const uint32_t num_global_passes = front_size > gsi ? front_size / gsi : 0;
    for(uint32_t i = 0; i < num_global_passes; ++i)
        res = op(res, read_global_safe((i + 1) * gsi + gid));

    // Perform warp reductions and communicate results via shared.
    // tmp::static_for appears to unroll at compile time over a shrinking
    // active-warp count (WarpCount down to 1) — semantics defined in the
    // project's tmp utilities; verify there.
    tmp::static_for<
        WarpCount,
        tmp::not_equal<0>,
        tmp::select<tmp::not_equal<1>, tmp::divide_ceil<WarpSize>, tmp::constant<0>>>(
        [&]<uint32_t ActiveWarps>()
        {
            if(wid < ActiveWarps)
            {
                // Warp reduction: shuffle distance halves each step until
                // lane 0 holds the warp's partial result.
                tmp::static_for<WarpSize / 2, tmp::not_equal<0>, tmp::divide<2>>(
                    [&]<int Delta>() { res = op(res, __shfl_down(res, Delta)); });

                // Write warp result from local to shared
                if(lid == 0)
                    shared[wid] = res;
            }
            // Block-wide barrier outside the branch: all threads reach it.
            __syncthreads();

            // Read warp result from shared to local (zero_elem past WarpCount)
            res = read_shared_safe(tid);
        });

    // Write result from local to back buffer
    if(tid == 0)
        back[bid] = res;
}
};
} // namespace reduction
Loading
Loading