Skip to content

Commit

Permalink
add shfl_up, shfl_down and shfl_xor methods
Browse files Browse the repository at this point in the history
  • Loading branch information
AuroraPerego committed Nov 6, 2023
1 parent 2b64195 commit a2468e4
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 24 deletions.
132 changes: 124 additions & 8 deletions include/alpaka/warp/Traits.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

Expand Down Expand Up @@ -39,6 +39,18 @@ namespace alpaka::warp
template<typename TWarp, typename TSfinae = void>
struct Shfl;

//! The shfl up warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflUp;

//! The shfl down warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflDown;

//! The shfl xor warp swizzling trait.
template<typename TWarp, typename TSfinae = void>
struct ShflXor;

//! The active mask trait.
template<typename TWarp, typename TSfinae = void>
struct Activemask;
Expand Down Expand Up @@ -162,7 +174,7 @@ namespace alpaka::warp
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[(srcLane + width*floor(threadIdx.x/width))%width];
//! return values[width*(threadIdx.x/width) + srcLane%width];
//!
//! However, it does not use shared memory.
//!
Expand All @@ -182,19 +194,123 @@ namespace alpaka::warp
//! \param width number of threads receiving a single value
//! \return val from the thread index srcLane.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, std::int32_t value, std::int32_t srcLane, std::int32_t width = 0)
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, T value, std::int32_t srcLane, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
}

//! shfl for float vals
//! Exchange data between threads within a warp.
//! It copies from a lane with lower ID relative to caller.
//! The lane ID is calculated by subtracting delta from the caller’s lane ID.
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + threadIdx.x%width - delta];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the srcLane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl} else {shfl}.
//!
//! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x - delta] if threadIdx.x >=
//! delta)
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to broadcast
//! \param offset corresponds to the delta used to compute the lane ID
//! \param width number of threads receiving a single value
//! \return val from the thread index lane ID.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp>
ALPAKA_FN_ACC auto shfl(TWarp const& warp, float value, std::int32_t srcLane, std::int32_t width = 0)
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_up(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::Shfl<ImplementationBase>::shfl(warp, value, srcLane, width ? width : getSize(warp));
return trait::ShflUp<ImplementationBase>::shfl_up(warp, value, offset, width ? width : getSize(warp));
}

//! Exchange data between threads within a warp.
//! It copies from a lane with higher ID relative to caller.
//! The lane ID is calculated by adding delta from the caller’s lane ID.
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + threadIdx.x%width + delta];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the srcLane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl} else {shfl}.
//!
//! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x+delta] if threadIdx.x+delta <
//! warpsize)
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to broadcast
//! \param offset corresponds to the delta used to compute the lane ID
//! \param width number of threads receiving a single value
//! \return val from the thread index lane ID.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_down(TWarp const& warp, T value, std::uint32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::ShflDown<ImplementationBase>::shfl_down(warp, value, offset, width ? width : getSize(warp));
}

//! Exchange data between threads within a warp.
//! It copies from a lane based on bitwise XOR of own lane ID.
//! The lane ID is calculated by performing a bitwise XOR of the caller’s lane ID with delta
//!
//! Effectively executes:
//!
//! __shared__ int32_t values[warpsize];
//! values[threadIdx.x] = value;
//! __syncthreads();
//! return values[width*(threadIdx.x/width) + (threadIdx.x%width ^ delta)];
//!
//! However, it does not use shared memory.
//!
//! Notes:
//! * The programmer must ensure that all threads calling this
//! function (and the srcLane) are executing the same line of code.
//! In particular it is not portable to write if(a) {shfl} else {shfl}.
//!
//! * Commonly used with width = warpsize (the default), (returns values[threadIdx.x^delta])
//!
//! * Width must be a power of 2.
//!
//! \tparam TWarp warp implementation type
//! \tparam T value type
//! \param warp warp implementation
//! \param value value to broadcast
//! \param offset corresponds to the delta used to compute the lane ID
//! \param width number of threads receiving a single value
//! \return val from the thread index lane ID.
ALPAKA_NO_HOST_ACC_WARNING
template<typename TWarp, typename T>
ALPAKA_FN_ACC auto shfl_xor(TWarp const& warp, T value, std::int32_t offset, std::int32_t width = 0)
{
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
return trait::ShflXor<ImplementationBase>::shfl_xor(warp, value, offset, width ? width : getSize(warp));
}
} // namespace alpaka::warp
63 changes: 62 additions & 1 deletion include/alpaka/warp/WarpGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ namespace alpaka::warp::trait
static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
{
ALPAKA_ASSERT_OFFLOAD(width > 0);
ALPAKA_ASSERT_OFFLOAD(srcLane < width);
ALPAKA_ASSERT_OFFLOAD(srcLane >= 0);

/* If width < srcLane the sub-group needs to be split into assumed subdivisions. The first item of each
Expand All @@ -132,6 +131,68 @@ namespace alpaka::warp::trait
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflUp<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_up(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::uint32_t offset, /* must be the same for all work-items in the group */
std::int32_t width)
{
std::int32_t offset_int = static_cast<std::int32_t>(offset);
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_group_id = actual_item_id / width;
auto const actual_src_id = actual_item_id - offset_int;
auto const src = actual_src_id >= actual_group_id * width
? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
: sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflDown<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_down(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::uint32_t offset,
std::int32_t width)
{
std::int32_t offset_int = static_cast<std::int32_t>(offset);
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_group_id = actual_item_id / width;
auto const actual_src_id = actual_item_id + offset_int;
auto const src = actual_src_id < (actual_group_id + 1) * width
? sycl::id<1>{static_cast<std::size_t>(actual_src_id)}
: sycl::id<1>{static_cast<std::size_t>(actual_item_id)};
return sycl::select_from_group(actual_group, value, src);
}
};

template<typename TDim>
struct ShflXor<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl_xor(
warp::WarpGenericSycl<TDim> const& warp,
T value,
std::int32_t offset,
std::int32_t /*width*/)
{
auto const actual_group = warp.m_item_warp.get_sub_group();
auto actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_src_id = actual_item_id ^ offset;
auto const src = sycl::id<1>{static_cast<std::size_t>(actual_src_id)};
return sycl::select_from_group(actual_group, value, src);
}
};
} // namespace alpaka::warp::trait

#endif
42 changes: 38 additions & 4 deletions include/alpaka/warp/WarpSingleThread.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

Expand Down Expand Up @@ -65,18 +65,52 @@ namespace alpaka::warp
template<>
struct Shfl<WarpSingleThread>
{
template<typename T>
static auto shfl(
warp::WarpSingleThread const& /*warp*/,
std::int32_t val,
T val,
std::int32_t /*srcLane*/,
std::int32_t /*width*/)
{
return val;
}
};

static auto shfl(
template<>
struct ShflUp<WarpSingleThread>
{
template<typename T>
static auto shfl_up(
warp::WarpSingleThread const& /*warp*/,
T val,
std::uint32_t /*srcLane*/,
std::int32_t /*width*/)
{
return val;
}
};

template<>
struct ShflDown<WarpSingleThread>
{
template<typename T>
static auto shfl_down(
warp::WarpSingleThread const& /*warp*/,
T val,
std::uint32_t /*srcLane*/,
std::int32_t /*width*/)
{
return val;
}
};

template<>
struct ShflXor<WarpSingleThread>
{
template<typename T>
static auto shfl_xor(
warp::WarpSingleThread const& /*warp*/,
float val,
T val,
std::int32_t /*srcLane*/,
std::int32_t /*width*/)
{
Expand Down
63 changes: 52 additions & 11 deletions include/alpaka/warp/WarpUniformCudaHipBuiltIn.hpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber
/* Copyright 2022 Sergei Bastrakov, David M. Rogers, Jan Stephan, Andrea Bocci, Bernhard Manfred Gruber, Aurora Perego
* SPDX-License-Identifier: MPL-2.0
*/

Expand Down Expand Up @@ -113,34 +113,75 @@ namespace alpaka::warp
template<>
struct Shfl<WarpUniformCudaHipBuiltIn>
{
//-------------------------------------------------------------
template<typename T>
__device__ static auto shfl(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
float val,
T val,
int srcLane,
std::int32_t width) -> float
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, width);
# else
return __shfl(val, srcLane, width);
# endif
}
};

//-------------------------------------------------------------
__device__ static auto shfl(
template<>
struct ShflUp<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_up(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
std::int32_t val,
int srcLane,
std::int32_t width) -> std::int32_t
T val,
std::uint32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_sync(activemask(warp), val, srcLane, width);
return __shfl_up_sync(activemask(warp), val, offset, width);
# else
return __shfl(val, srcLane, width);
return __shfl_up(val, offset, width);
# endif
}
};

template<>
struct ShflDown<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_down(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
T val,
std::uint32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_down_sync(activemask(warp), val, offset, width);
# else
return __shfl_down(val, offset, width);
# endif
}
};

template<>
struct ShflXor<WarpUniformCudaHipBuiltIn>
{
template<typename T>
__device__ static auto shfl_xor(
[[maybe_unused]] warp::WarpUniformCudaHipBuiltIn const& warp,
T val,
std::int32_t offset,
std::int32_t width) -> T
{
# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
return __shfl_xor_sync(activemask(warp), val, offset, width);
# else
return __shfl_xor(val, offset, width);
# endif
}
};

} // namespace trait
# endif
} // namespace alpaka::warp
Expand Down

0 comments on commit a2468e4

Please sign in to comment.