Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Adopt the experimental free function extension #2073

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 1 addition & 15 deletions include/alpaka/acc/AccGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,25 +75,11 @@ namespace alpaka

AccGenericSycl(
Vec<TDim, TIdx> const& threadElemExtent,
sycl::nd_item<TDim::value> work_item,
sycl::local_accessor<std::byte> dyn_shared_acc,
sycl::local_accessor<std::byte> st_shared_acc)
: WorkDivGenericSycl<TDim, TIdx>{threadElemExtent, work_item}
, gb::IdxGbGenericSycl<TDim, TIdx>{work_item}
, bt::IdxBtGenericSycl<TDim, TIdx>{work_item}
, AtomicHierarchy<AtomicGenericSycl, AtomicGenericSycl, AtomicGenericSycl>{}
, math::MathGenericSycl{}
: WorkDivGenericSycl<TDim, TIdx>{threadElemExtent}
, BlockSharedMemDynGenericSycl{dyn_shared_acc}
, BlockSharedMemStGenericSycl{st_shared_acc}
, BlockSyncGenericSycl<TDim>{work_item}
, IntrinsicGenericSycl{}
, MemFenceGenericSycl{}
# ifdef ALPAKA_DISABLE_VENDOR_RNG
, rand::RandDefault{}
# else
, rand::RandGenericSycl<TDim>{work_item}
# endif
, warp::WarpGenericSycl<TDim>{work_item}
{
}
};
Expand Down
32 changes: 16 additions & 16 deletions include/alpaka/block/sync/BlockSyncGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,7 @@ namespace alpaka
public:
using BlockSyncBase = BlockSyncGenericSycl<TDim>;

BlockSyncGenericSycl(sycl::nd_item<TDim::value> work_item) : my_item{work_item}
{
}

sycl::nd_item<TDim::value> my_item;
BlockSyncGenericSycl() = default;
};
} // namespace alpaka

Expand All @@ -32,20 +28,22 @@ namespace alpaka::trait
template<typename TDim>
struct SyncBlockThreads<BlockSyncGenericSycl<TDim>>
{
static auto syncBlockThreads(BlockSyncGenericSycl<TDim> const& blockSync) -> void
static auto syncBlockThreads(BlockSyncGenericSycl<TDim> const&) -> void
{
blockSync.my_item.barrier();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
item.barrier();
}
};

template<typename TDim>
struct SyncBlockThreadsPredicate<BlockCount, BlockSyncGenericSycl<TDim>>
{
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const&, int predicate) -> int
{
auto const group = blockSync.my_item.get_group();
blockSync.my_item.barrier();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
item.barrier();

auto const group = item.get_group();
auto const counter = (predicate != 0) ? 1 : 0;
return sycl::reduce_over_group(group, counter, sycl::plus<>{});
}
Expand All @@ -54,23 +52,25 @@ namespace alpaka::trait
template<typename TDim>
struct SyncBlockThreadsPredicate<BlockAnd, BlockSyncGenericSycl<TDim>>
{
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const&, int predicate) -> int
{
auto const group = blockSync.my_item.get_group();
blockSync.my_item.barrier();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
item.barrier();

auto const group = item.get_group();
return static_cast<int>(sycl::all_of_group(group, static_cast<bool>(predicate)));
}
};

template<typename TDim>
struct SyncBlockThreadsPredicate<BlockOr, BlockSyncGenericSycl<TDim>>
{
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const& blockSync, int predicate) -> int
static auto syncBlockThreadsPredicate(BlockSyncGenericSycl<TDim> const&, int predicate) -> int
{
auto const group = blockSync.my_item.get_group();
blockSync.my_item.barrier();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
item.barrier();

auto const group = item.get_group();
return static_cast<int>(sycl::any_of_group(group, static_cast<bool>(predicate)));
}
};
Expand Down
22 changes: 10 additions & 12 deletions include/alpaka/idx/bt/IdxBtGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,7 @@ namespace alpaka::bt
public:
using IdxBtBase = IdxBtGenericSycl;

explicit IdxBtGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_bt{work_item}
{
}

sycl::nd_item<TDim::value> m_item_bt;
IdxBtGenericSycl() = default;
};
} // namespace alpaka::bt

Expand All @@ -46,22 +42,24 @@ namespace alpaka::trait
{
//! \return The index of the current thread in the block.
template<typename TWorkDiv>
static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&) -> Vec<TDim, TIdx>
static auto getIdx(bt::IdxBtGenericSycl<TDim, TIdx> const&, TWorkDiv const&) -> Vec<TDim, TIdx>
{
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();

if constexpr(TDim::value == 1)
return Vec<TDim, TIdx>{static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
return Vec<TDim, TIdx>{static_cast<TIdx>(item.get_local_id(0))};
else if constexpr(TDim::value == 2)
{
return Vec<TDim, TIdx>{
static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
static_cast<TIdx>(item.get_local_id(1)),
static_cast<TIdx>(item.get_local_id(0))};
}
else
{
return Vec<TDim, TIdx>{
static_cast<TIdx>(idx.m_item_bt.get_local_id(2)),
static_cast<TIdx>(idx.m_item_bt.get_local_id(1)),
static_cast<TIdx>(idx.m_item_bt.get_local_id(0))};
static_cast<TIdx>(item.get_local_id(2)),
static_cast<TIdx>(item.get_local_id(1)),
static_cast<TIdx>(item.get_local_id(0))};
}
}
};
Expand Down
22 changes: 9 additions & 13 deletions include/alpaka/idx/gb/IdxGbGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,7 @@ namespace alpaka::gb
public:
using IdxGbBase = IdxGbGenericSycl;

explicit IdxGbGenericSycl(sycl::nd_item<TDim::value> work_item) : m_item_gb{work_item}
{
}

sycl::nd_item<TDim::value> m_item_gb;
IdxGbGenericSycl() = default;
};
} // namespace alpaka::gb

Expand All @@ -46,22 +42,22 @@ namespace alpaka::trait
{
//! \return The index of the current block in the grid.
template<typename TWorkDiv>
static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const& idx, TWorkDiv const&)
static auto getIdx(gb::IdxGbGenericSycl<TDim, TIdx> const&, TWorkDiv const&)
{
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();

if constexpr(TDim::value == 1)
return Vec<TDim, TIdx>(static_cast<TIdx>(idx.m_item_gb.get_group(0)));
return Vec<TDim, TIdx>(static_cast<TIdx>(item.get_group(0)));
else if constexpr(TDim::value == 2)
{
return Vec<TDim, TIdx>(
static_cast<TIdx>(idx.m_item_gb.get_group(1)),
static_cast<TIdx>(idx.m_item_gb.get_group(0)));
return Vec<TDim, TIdx>(static_cast<TIdx>(item.get_group(1)), static_cast<TIdx>(item.get_group(0)));
}
else
{
return Vec<TDim, TIdx>(
static_cast<TIdx>(idx.m_item_gb.get_group(2)),
static_cast<TIdx>(idx.m_item_gb.get_group(1)),
static_cast<TIdx>(idx.m_item_gb.get_group(0)));
static_cast<TIdx>(item.get_group(2)),
static_cast<TIdx>(item.get_group(1)),
static_cast<TIdx>(item.get_group(0)));
}
}
};
Expand Down
14 changes: 6 additions & 8 deletions include/alpaka/kernel/TaskKernelGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@
# define LAUNCH_SYCL_KERNEL_IF_SUBGROUP_SIZE_IS(sub_group_size) \
cgh.parallel_for( \
sycl::nd_range<TDim::value>{global_size, local_size}, \
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
sycl::nd_item<TDim::value> work_item) [[intel::reqd_sub_group_size(sub_group_size)]] \
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](sycl::nd_item<TDim::value>) \
[[intel::reqd_sub_group_size(sub_group_size)]] \
{ \
auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
auto acc = TAcc{item_elements, dyn_shared_accessor, st_shared_accessor}; \
core::apply( \
[k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
k_args); \
Expand All @@ -52,10 +52,9 @@
# define LAUNCH_SYCL_KERNEL_WITH_DEFAULT_SUBGROUP_SIZE \
cgh.parallel_for( \
sycl::nd_range<TDim::value>{global_size, local_size}, \
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
sycl::nd_item<TDim::value> work_item) \
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](sycl::nd_item<TDim::value>) \
{ \
auto acc = TAcc{item_elements, work_item, dyn_shared_accessor, st_shared_accessor}; \
auto acc = TAcc{item_elements, dyn_shared_accessor, st_shared_accessor}; \
core::apply( \
[k_func, &acc](typename std::decay_t<TArgs> const&... args) { k_func(acc, args...); }, \
k_args); \
Expand All @@ -65,8 +64,7 @@
throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported)); \
cgh.parallel_for( \
sycl::nd_range<TDim::value>{global_size, local_size}, \
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args]( \
sycl::nd_item<TDim::value> work_item) {});
[item_elements, dyn_shared_accessor, st_shared_accessor, k_func, k_args](sycl::nd_item<TDim::value>) {});

namespace alpaka
{
Expand Down
9 changes: 3 additions & 6 deletions include/alpaka/rand/RandGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,7 @@ namespace alpaka::rand
template<typename TDim>
struct RandGenericSycl : concepts::Implements<ConceptRand, RandGenericSycl<TDim>>
{
explicit RandGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_rand{my_item}
{
}

sycl::nd_item<TDim::value> m_item_rand;
RandGenericSycl() = default;
};

# if !defined(ALPAKA_HOST_ONLY)
Expand Down Expand Up @@ -72,7 +68,8 @@ namespace alpaka::rand

Minstd(RandGenericSycl<TDim> rand, std::uint32_t const& seed)
{
oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id());
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
oneapi::dpl::minstd_rand engine(seed, item.get_global_linear_id());
rng_engine = engine;
}

Expand Down
36 changes: 19 additions & 17 deletions include/alpaka/warp/WarpGenericSycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,7 @@ namespace alpaka::warp
class WarpGenericSycl : public concepts::Implements<alpaka::warp::ConceptWarp, WarpGenericSycl<TDim>>
{
public:
WarpGenericSycl(sycl::nd_item<TDim::value> my_item) : m_item_warp{my_item}
{
}

sycl::nd_item<TDim::value> m_item_warp;
WarpGenericSycl() = default;
};
} // namespace alpaka::warp

Expand All @@ -33,9 +29,10 @@ namespace alpaka::warp::trait
template<typename TDim>
struct GetSize<warp::WarpGenericSycl<TDim>>
{
static auto getSize(warp::WarpGenericSycl<TDim> const& warp) -> std::int32_t
static auto getSize(warp::WarpGenericSycl<TDim> const&) -> std::int32_t
{
auto const sub_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const sub_group = item.get_sub_group();
// SYCL sub-groups are always 1D
return static_cast<std::int32_t>(sub_group.get_max_local_range()[0]);
}
Expand All @@ -47,11 +44,12 @@ namespace alpaka::warp::trait
// FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
// but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
// Restrict to warpSize <= 32 for now.
static auto activemask(warp::WarpGenericSycl<TDim> const& warp) -> std::uint32_t
static auto activemask(warp::WarpGenericSycl<TDim> const&) -> std::uint32_t
{
// SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code
// regions anyway we return the full mask.
auto const sub_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const sub_group = item.get_sub_group();
auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true);
// FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
// but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
Expand All @@ -65,19 +63,21 @@ namespace alpaka::warp::trait
template<typename TDim>
struct All<warp::WarpGenericSycl<TDim>>
{
static auto all(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
static auto all(warp::WarpGenericSycl<TDim> const&, std::int32_t predicate) -> std::int32_t
{
auto const sub_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const sub_group = item.get_sub_group();
return static_cast<std::int32_t>(sycl::all_of_group(sub_group, static_cast<bool>(predicate)));
}
};

template<typename TDim>
struct Any<warp::WarpGenericSycl<TDim>>
{
static auto any(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::int32_t
static auto any(warp::WarpGenericSycl<TDim> const&, std::int32_t predicate) -> std::int32_t
{
auto const sub_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const sub_group = item.get_sub_group();
return static_cast<std::int32_t>(sycl::any_of_group(sub_group, static_cast<bool>(predicate)));
}
};
Expand All @@ -88,9 +88,10 @@ namespace alpaka::warp::trait
// FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
// but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
// Restrict to warpSize <= 32 for now.
static auto ballot(warp::WarpGenericSycl<TDim> const& warp, std::int32_t predicate) -> std::uint32_t
static auto ballot(warp::WarpGenericSycl<TDim> const&, std::int32_t predicate) -> std::uint32_t
{
auto const sub_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const sub_group = item.get_sub_group();
auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast<bool>(predicate));
// FIXME This should be std::uint64_t on AMD GCN architectures and on CPU,
// but the former is not targeted in alpaka and CPU case is not supported in SYCL yet.
Expand All @@ -105,7 +106,7 @@ namespace alpaka::warp::trait
struct Shfl<warp::WarpGenericSycl<TDim>>
{
template<typename T>
static auto shfl(warp::WarpGenericSycl<TDim> const& warp, T value, std::int32_t srcLane, std::int32_t width)
static auto shfl(warp::WarpGenericSycl<TDim> const&, T value, std::int32_t srcLane, std::int32_t width)
{
ALPAKA_ASSERT_OFFLOAD(width > 0);
ALPAKA_ASSERT_OFFLOAD(srcLane < width);
Expand All @@ -117,7 +118,8 @@ namespace alpaka::warp::trait
Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions:
The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the
first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */
auto const actual_group = warp.m_item_warp.get_sub_group();
auto const item = sycl::ext::oneapi::experimental::this_nd_item<TDim::value>();
auto const actual_group = item.get_sub_group();
auto const actual_item_id = static_cast<std::int32_t>(actual_group.get_local_linear_id());
auto const actual_group_id = actual_item_id / width;
auto const actual_src_id = static_cast<std::size_t>(srcLane + actual_group_id * width);
Expand Down
Loading