From 497128da5a1857e24ba10857a58daa7f5c7d4923 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Thu, 10 Aug 2023 10:23:13 +0200 Subject: [PATCH] [SYCL] Adopt the experimental free function extension Access the current nd_item via a free function instead of storing it as a data member of the various SYCL accelerator base classes. --- include/alpaka/acc/AccGenericSycl.hpp | 13 +++---- .../block/sync/BlockSyncGenericSycl.hpp | 32 ++++++++-------- include/alpaka/idx/bt/IdxBtGenericSycl.hpp | 22 +++++------ include/alpaka/idx/gb/IdxGbGenericSycl.hpp | 22 +++++------ .../alpaka/kernel/TaskKernelGenericSycl.hpp | 2 - include/alpaka/rand/RandGenericSycl.hpp | 9 ++--- include/alpaka/warp/WarpGenericSycl.hpp | 36 +++++++++--------- include/alpaka/workdiv/WorkDivGenericSycl.hpp | 37 ++++++++++--------- 8 files changed, 82 insertions(+), 91 deletions(-) diff --git a/include/alpaka/acc/AccGenericSycl.hpp b/include/alpaka/acc/AccGenericSycl.hpp index 03d1e23f90c1..23ed66b7ee6e 100644 --- a/include/alpaka/acc/AccGenericSycl.hpp +++ b/include/alpaka/acc/AccGenericSycl.hpp @@ -75,27 +75,26 @@ namespace alpaka AccGenericSycl( Vec const& threadElemExtent, - sycl::nd_item work_item, sycl::local_accessor dyn_shared_acc, sycl::local_accessor st_shared_acc, sycl::accessor global_fence_dummy, sycl::local_accessor local_fence_dummy) - : WorkDivGenericSycl{threadElemExtent, work_item} - , gb::IdxGbGenericSycl{work_item} - , bt::IdxBtGenericSycl{work_item} + : WorkDivGenericSycl{threadElemExtent} + , gb::IdxGbGenericSycl{} + , bt::IdxBtGenericSycl{} , AtomicHierarchy{} , math::MathGenericSycl{} , BlockSharedMemDynGenericSycl{dyn_shared_acc} , BlockSharedMemStGenericSycl{st_shared_acc} - , BlockSyncGenericSycl{work_item} + , BlockSyncGenericSycl{} , IntrinsicGenericSycl{} , MemFenceGenericSycl{global_fence_dummy, local_fence_dummy} # ifdef ALPAKA_DISABLE_VENDOR_RNG , rand::RandDefault{} # else - , rand::RandGenericSycl{work_item} + , rand::RandGenericSycl{} # endif - , warp::WarpGenericSycl{work_item} + , warp::WarpGenericSycl{} { } }; diff --git a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp index 67e97493fee4..9461c6028e06 100644 --- a/include/alpaka/block/sync/BlockSyncGenericSycl.hpp +++ b/include/alpaka/block/sync/BlockSyncGenericSycl.hpp @@ -19,11 +19,7 @@ namespace alpaka public: using BlockSyncBase = BlockSyncGenericSycl; - BlockSyncGenericSycl(sycl::nd_item work_item) : my_item{work_item} - { - } - - sycl::nd_item my_item; + BlockSyncGenericSycl() = default; }; } // namespace alpaka @@ -32,20 +28,22 @@ namespace alpaka::trait template struct SyncBlockThreads> { - static auto syncBlockThreads(BlockSyncGenericSycl const& blockSync) -> void + static auto syncBlockThreads(BlockSyncGenericSycl const&) -> void { - blockSync.my_item.barrier(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + item.barrier(); } }; template struct SyncBlockThreadsPredicate> { - static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const& blockSync, int predicate) -> int + static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const&, int predicate) -> int { - auto const group = blockSync.my_item.get_group(); - blockSync.my_item.barrier(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + item.barrier(); + auto const group = item.get_group(); auto const counter = (predicate != 0) ? 1 : 0; return sycl::reduce_over_group(group, counter, sycl::plus<>{}); } @@ -54,11 +52,12 @@ namespace alpaka::trait template struct SyncBlockThreadsPredicate> { - static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const& blockSync, int predicate) -> int + static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const&, int predicate) -> int { - auto const group = blockSync.my_item.get_group(); - blockSync.my_item.barrier(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + item.barrier(); + auto const group = item.get_group(); return static_cast(sycl::all_of_group(group, static_cast(predicate))); } }; @@ -66,11 +65,12 @@ namespace alpaka::trait template struct SyncBlockThreadsPredicate> { - static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const& blockSync, int predicate) -> int + static auto syncBlockThreadsPredicate(BlockSyncGenericSycl const&, int predicate) -> int { - auto const group = blockSync.my_item.get_group(); - blockSync.my_item.barrier(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + item.barrier(); + auto const group = item.get_group(); return static_cast(sycl::any_of_group(group, static_cast(predicate))); } }; diff --git a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp index 54ef78014f1f..d07aeb982955 100644 --- a/include/alpaka/idx/bt/IdxBtGenericSycl.hpp +++ b/include/alpaka/idx/bt/IdxBtGenericSycl.hpp @@ -23,11 +23,7 @@ namespace alpaka::bt public: using IdxBtBase = IdxBtGenericSycl; - explicit IdxBtGenericSycl(sycl::nd_item work_item) : m_item_bt{work_item} - { - } - - sycl::nd_item m_item_bt; + IdxBtGenericSycl() = default; }; } // namespace alpaka::bt @@ -46,22 +42,24 @@ namespace alpaka::trait { //! \return The index of the current thread in the block. template - static auto getIdx(bt::IdxBtGenericSycl const& idx, TWorkDiv const&) -> Vec + static auto getIdx(bt::IdxBtGenericSycl const&, TWorkDiv const&) -> Vec { + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + if constexpr(TDim::value == 1) - return Vec{static_cast(idx.m_item_bt.get_local_id(0))}; + return Vec{static_cast(item.get_local_id(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(idx.m_item_bt.get_local_id(1)), - static_cast(idx.m_item_bt.get_local_id(0))}; + static_cast(item.get_local_id(1)), + static_cast(item.get_local_id(0))}; } else { return Vec{ - static_cast(idx.m_item_bt.get_local_id(2)), - static_cast(idx.m_item_bt.get_local_id(1)), - static_cast(idx.m_item_bt.get_local_id(0))}; + static_cast(item.get_local_id(2)), + static_cast(item.get_local_id(1)), + static_cast(item.get_local_id(0))}; } } }; diff --git a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp index 42547effd6c3..ca138be3f8a2 100644 --- a/include/alpaka/idx/gb/IdxGbGenericSycl.hpp +++ b/include/alpaka/idx/gb/IdxGbGenericSycl.hpp @@ -23,11 +23,7 @@ namespace alpaka::gb public: using IdxGbBase = IdxGbGenericSycl; - explicit IdxGbGenericSycl(sycl::nd_item work_item) : m_item_gb{work_item} - { - } - - sycl::nd_item m_item_gb; + IdxGbGenericSycl() = default; }; } // namespace alpaka::gb @@ -46,22 +42,22 @@ namespace alpaka::trait { //! \return The index of the current block in the grid. template - static auto getIdx(gb::IdxGbGenericSycl const& idx, TWorkDiv const&) + static auto getIdx(gb::IdxGbGenericSycl const&, TWorkDiv const&) { + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + if constexpr(TDim::value == 1) - return Vec(static_cast(idx.m_item_gb.get_group(0))); + return Vec(static_cast(item.get_group(0))); else if constexpr(TDim::value == 2) { - return Vec( - static_cast(idx.m_item_gb.get_group(1)), - static_cast(idx.m_item_gb.get_group(0))); + return Vec(static_cast(item.get_group(1)), static_cast(item.get_group(0))); } else { return Vec( - static_cast(idx.m_item_gb.get_group(2)), - static_cast(idx.m_item_gb.get_group(1)), - static_cast(idx.m_item_gb.get_group(0))); + static_cast(item.get_group(2)), + static_cast(item.get_group(1)), + static_cast(item.get_group(0))); } } }; diff --git a/include/alpaka/kernel/TaskKernelGenericSycl.hpp b/include/alpaka/kernel/TaskKernelGenericSycl.hpp index cb74b6685d0c..419f71ae1973 100644 --- a/include/alpaka/kernel/TaskKernelGenericSycl.hpp +++ b/include/alpaka/kernel/TaskKernelGenericSycl.hpp @@ -50,7 +50,6 @@ { \ auto acc = TAcc{ \ item_elements, \ - work_item, \ dyn_shared_accessor, \ st_shared_accessor, \ global_fence_dummy, \ @@ -73,7 +72,6 @@ { \ auto acc = TAcc{ \ item_elements, \ - work_item, \ dyn_shared_accessor, \ st_shared_accessor, \ global_fence_dummy, \ diff --git a/include/alpaka/rand/RandGenericSycl.hpp b/include/alpaka/rand/RandGenericSycl.hpp index 99329a0a52bb..accb7c959c45 100644 --- a/include/alpaka/rand/RandGenericSycl.hpp +++ b/include/alpaka/rand/RandGenericSycl.hpp @@ -40,11 +40,7 @@ namespace alpaka::rand template struct RandGenericSycl : concepts::Implements> { - explicit RandGenericSycl(sycl::nd_item my_item) : m_item_rand{my_item} - { - } - - sycl::nd_item m_item_rand; + RandGenericSycl() = default; }; # if !defined(ALPAKA_HOST_ONLY) @@ -72,7 +68,8 @@ namespace alpaka::rand Minstd(RandGenericSycl rand, std::uint32_t const& seed) { - oneapi::dpl::minstd_rand engine(seed, rand.m_item_rand.get_global_linear_id()); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + oneapi::dpl::minstd_rand engine(seed, item.get_global_linear_id()); rng_engine = engine; } diff --git a/include/alpaka/warp/WarpGenericSycl.hpp b/include/alpaka/warp/WarpGenericSycl.hpp index a9b3a56fe69f..15e012cbe10b 100644 --- a/include/alpaka/warp/WarpGenericSycl.hpp +++ b/include/alpaka/warp/WarpGenericSycl.hpp @@ -20,11 +20,7 @@ namespace alpaka::warp class WarpGenericSycl : public concepts::Implements> { public: - WarpGenericSycl(sycl::nd_item my_item) : m_item_warp{my_item} - { - } - - sycl::nd_item m_item_warp; + WarpGenericSycl() = default; }; } // namespace alpaka::warp @@ -33,9 +29,10 @@ namespace alpaka::warp::trait template struct GetSize> { - static auto getSize(warp::WarpGenericSycl const& warp) -> std::int32_t + static auto getSize(warp::WarpGenericSycl const&) -> std::int32_t { - auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const sub_group = item.get_sub_group(); // SYCL sub-groups are always 1D return static_cast(sub_group.get_max_local_range()[0]); } @@ -47,11 +44,12 @@ namespace alpaka::warp::trait // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. // Restrict to warpSize <= 32 for now. - static auto activemask(warp::WarpGenericSycl const& warp) -> std::uint32_t + static auto activemask(warp::WarpGenericSycl const&) -> std::uint32_t { // SYCL has no way of querying this. Since sub-group functions have to be executed in convergent code // regions anyway we return the full mask. - auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const sub_group = item.get_sub_group(); auto const mask = sycl::ext::oneapi::group_ballot(sub_group, true); // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. @@ -65,9 +63,10 @@ namespace alpaka::warp::trait template struct All> { - static auto all(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t + static auto all(warp::WarpGenericSycl const&, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const sub_group = item.get_sub_group(); return static_cast(sycl::all_of_group(sub_group, static_cast(predicate))); } }; @@ -75,9 +74,10 @@ namespace alpaka::warp::trait template struct Any> { - static auto any(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::int32_t + static auto any(warp::WarpGenericSycl const&, std::int32_t predicate) -> std::int32_t { - auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const sub_group = item.get_sub_group(); return static_cast(sycl::any_of_group(sub_group, static_cast(predicate))); } }; @@ -88,9 +88,10 @@ namespace alpaka::warp::trait // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. // Restrict to warpSize <= 32 for now. - static auto ballot(warp::WarpGenericSycl const& warp, std::int32_t predicate) -> std::uint32_t + static auto ballot(warp::WarpGenericSycl const&, std::int32_t predicate) -> std::uint32_t { - auto const sub_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const sub_group = item.get_sub_group(); auto const mask = sycl::ext::oneapi::group_ballot(sub_group, static_cast(predicate)); // FIXME This should be std::uint64_t on AMD GCN architectures and on CPU, // but the former is not targeted in alpaka and CPU case is not supported in SYCL yet. @@ -105,7 +106,7 @@ namespace alpaka::warp::trait struct Shfl> { template - static auto shfl(warp::WarpGenericSycl const& warp, T value, std::int32_t srcLane, std::int32_t width) + static auto shfl(warp::WarpGenericSycl const&, T value, std::int32_t srcLane, std::int32_t width) { ALPAKA_ASSERT_OFFLOAD(width > 0); ALPAKA_ASSERT_OFFLOAD(srcLane < width); @@ -117,7 +118,8 @@ namespace alpaka::warp::trait Example: If we assume a sub-group size of 32 and a width of 16 we will receive two subdivisions: The first starts at sub-group index 0 and the second at sub-group index 16. For srcLane = 4 the first subdivision will access the value at sub-group index 4 and the second at sub-group index 20. */ - auto const actual_group = warp.m_item_warp.get_sub_group(); + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + auto const actual_group = item.get_sub_group(); auto const actual_item_id = static_cast(actual_group.get_local_linear_id()); auto const actual_group_id = actual_item_id / width; auto const actual_src_id = static_cast(srcLane + actual_group_id * width); diff --git a/include/alpaka/workdiv/WorkDivGenericSycl.hpp b/include/alpaka/workdiv/WorkDivGenericSycl.hpp index 26e00750e42d..14b6e38a8870 100644 --- a/include/alpaka/workdiv/WorkDivGenericSycl.hpp +++ b/include/alpaka/workdiv/WorkDivGenericSycl.hpp @@ -23,14 +23,11 @@ namespace alpaka public: using WorkDivBase = WorkDivGenericSycl; - WorkDivGenericSycl(Vec const& threadElemExtent, sycl::nd_item work_item) - : m_threadElemExtent{threadElemExtent} - , m_item_workdiv{work_item} + WorkDivGenericSycl(Vec const& threadElemExtent) : m_threadElemExtent{threadElemExtent} { } Vec const& m_threadElemExtent; - sycl::nd_item m_item_workdiv; }; } // namespace alpaka @@ -55,24 +52,26 @@ namespace alpaka::trait struct GetWorkDiv, origin::Grid, unit::Blocks> { //! \return The number of blocks in each dimension of the grid. - static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec + static auto getWorkDiv(WorkDivGenericSycl const&) -> Vec { + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + if constexpr(TDim::value == 0) return Vec{}; else if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.m_item_workdiv.get_group_range(0))}; + return Vec{static_cast(item.get_group_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.m_item_workdiv.get_group_range(1)), - static_cast(workDiv.m_item_workdiv.get_group_range(0))}; + static_cast(item.get_group_range(1)), + static_cast(item.get_group_range(0))}; } else { return Vec{ - static_cast(workDiv.m_item_workdiv.get_group_range(2)), - static_cast(workDiv.m_item_workdiv.get_group_range(1)), - static_cast(workDiv.m_item_workdiv.get_group_range(0))}; + static_cast(item.get_group_range(2)), + static_cast(item.get_group_range(1)), + static_cast(item.get_group_range(0))}; } } }; @@ -82,24 +81,26 @@ namespace alpaka::trait struct GetWorkDiv, origin::Block, unit::Threads> { //! \return The number of threads in each dimension of a block. - static auto getWorkDiv(WorkDivGenericSycl const& workDiv) -> Vec + static auto getWorkDiv(WorkDivGenericSycl const&) -> Vec { + auto const item = sycl::ext::oneapi::experimental::this_nd_item(); + if constexpr(TDim::value == 0) return Vec{}; else if constexpr(TDim::value == 1) - return Vec{static_cast(workDiv.m_item_workdiv.get_local_range(0))}; + return Vec{static_cast(item.get_local_range(0))}; else if constexpr(TDim::value == 2) { return Vec{ - static_cast(workDiv.m_item_workdiv.get_local_range(1)), - static_cast(workDiv.m_item_workdiv.get_local_range(0))}; + static_cast(item.get_local_range(1)), + static_cast(item.get_local_range(0))}; } else { return Vec{ - static_cast(workDiv.m_item_workdiv.get_local_range(2)), - static_cast(workDiv.m_item_workdiv.get_local_range(1)), - static_cast(workDiv.m_item_workdiv.get_local_range(0))}; + static_cast(item.get_local_range(2)), + static_cast(item.get_local_range(1)), + static_cast(item.get_local_range(0))}; } } };