Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Enable mapping of group load/store functions to SPIRV built-ins for local address space #16653

Merged
merged 1 commit into from
Jan 24, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions sycl/include/sycl/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
@@ -445,6 +445,47 @@ template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint64_t *Ptr,
dataT Data) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
uint8_t *Ptr) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint8_t *Ptr,
dataT Data) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
uint16_t *Ptr) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint16_t *Ptr,
dataT Data) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
uint32_t *Ptr) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint32_t *Ptr,
dataT Data) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT
__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local))
uint64_t *Ptr) noexcept;

template <typename dataT>
__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void
__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint64_t *Ptr,
dataT Data) noexcept;

template <int W, int rW>
extern __DPCPP_SYCL_EXTERNAL sycl::detail::ap_int<rW>
__spirv_FixedSqrtINTEL(sycl::detail::ap_int<W> a, bool S, int32_t I, int32_t rI,
112 changes: 90 additions & 22 deletions sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp
Original file line number Diff line number Diff line change
@@ -58,6 +58,13 @@ struct naive_key : detail::compile_time_property_key<detail::PropKind::Naive> {
using value_t = property_value<naive_key>;
};
inline constexpr naive_key::value_t naive;

struct native_local_block_io_key
: detail::compile_time_property_key<detail::PropKind::NativeLocalBlockIO> {
using value_t = property_value<native_local_block_io_key>;
};
inline constexpr native_local_block_io_key::value_t native_local_block_io;

using namespace sycl::detail;
} // namespace detail

@@ -154,7 +161,6 @@ template <typename BlockInfoTy> struct BlockTypeInfo;
template <typename IteratorT, std::size_t ElementsPerWorkItem, bool Blocked>
struct BlockTypeInfo<BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>> {
using BlockInfoTy = BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>;
static_assert(BlockInfoTy::has_builtin);

using block_type = detail::fixed_width_unsigned<BlockInfoTy::block_size>;

@@ -163,15 +169,23 @@ struct BlockTypeInfo<BlockInfo<IteratorT, ElementsPerWorkItem, Blocked>> {
typename std::iterator_traits<IteratorT>::reference>>,
std::add_const_t<block_type>, block_type>;

using block_pointer_type = typename detail::DecoratedType<
block_pointer_elem_type, access::address_space::global_space>::type *;
static constexpr auto deduced_address_space =
detail::deduce_AS<std::remove_cv_t<IteratorT>>::value;

using block_pointer_type =
typename detail::DecoratedType<block_pointer_elem_type,
deduced_address_space>::type *;

using block_op_type = std::conditional_t<
BlockInfoTy::num_blocks == 1, block_type,
detail::ConvertToOpenCLType_t<vec<block_type, BlockInfoTy::num_blocks>>>;
};

// Returns either a pointer suitable to use in a block read/write builtin or
// nullptr if some legality conditions aren't satisfied.
// Returns either a pointer decorated with the deduced address space, suitable
// to use in a block read/write builtin, or nullptr if some legality conditions
// aren't satisfied. If deduced address space is generic then returned pointer
// will have generic address space and has to be dynamically casted to global or
// local space before using in a builtin.
template <int RequiredAlign, std::size_t ElementsPerWorkItem,
typename IteratorT, typename Properties>
auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) {
@@ -211,16 +225,17 @@ auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) {
bool is_aligned = alignof(value_type) >= RequiredAlign ||
reinterpret_cast<uintptr_t>(iter) % RequiredAlign == 0;

constexpr auto AS = detail::deduce_AS<iter_no_cv>::value;
using block_pointer_type =
typename BlockTypeInfo<BlkInfo>::block_pointer_type;
if constexpr (AS == access::address_space::global_space) {

static constexpr auto deduced_address_space =
BlockTypeInfo<BlkInfo>::deduced_address_space;
if constexpr (deduced_address_space ==
access::address_space::generic_space ||
deduced_address_space ==
access::address_space::global_space ||
deduced_address_space == access::address_space::local_space) {
return is_aligned ? reinterpret_cast<block_pointer_type>(iter) : nullptr;
} else if constexpr (AS == access::address_space::generic_space) {
return is_aligned ? reinterpret_cast<block_pointer_type>(
detail::dynamic_address_cast<
access::address_space::global_space>(iter))
: nullptr;
} else {
return nullptr;
}
@@ -261,11 +276,37 @@ group_load(Group g, InputIteratorT in_ptr,
// Do optimized load.
using value_type = remove_decoration_t<
typename std::iterator_traits<InputIteratorT>::value_type>;

auto load = __spirv_SubgroupBlockReadINTEL<
typename detail::BlockTypeInfo<detail::BlockInfo<
InputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
ptr);
using block_info = typename detail::BlockTypeInfo<
detail::BlockInfo<InputIteratorT, ElementsPerWorkItem, blocked>>;
static constexpr auto deduced_address_space =
block_info::deduced_address_space;
using block_op_type = typename block_info::block_op_type;

if constexpr (deduced_address_space ==
access::address_space::local_space &&
!props.template has_property<
detail::native_local_block_io_key>())
return group_load(g, in_ptr, out, use_naive{});

block_op_type load;
if constexpr (deduced_address_space ==
access::address_space::generic_space) {
if (auto local_ptr = detail::dynamic_address_cast<
access::address_space::local_space>(ptr)) {
if constexpr (props.template has_property<
detail::native_local_block_io_key>())
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(local_ptr);
else
return group_load(g, in_ptr, out, use_naive{});
} else if (auto global_ptr = detail::dynamic_address_cast<
access::address_space::global_space>(ptr)) {
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(global_ptr);
} else {
return group_load(g, in_ptr, out, use_naive{});
}
} else {
load = __spirv_SubgroupBlockReadINTEL<block_op_type>(ptr);
}

// TODO: accessor_iterator's value_type is weird, so we need
// `std::remove_const_t` below:
@@ -331,6 +372,16 @@ group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
return group_store(g, in, out_ptr, use_naive{});

if constexpr (!std::is_same_v<std::nullptr_t, decltype(ptr)>) {
using block_info = typename detail::BlockTypeInfo<
detail::BlockInfo<OutputIteratorT, ElementsPerWorkItem, blocked>>;
static constexpr auto deduced_address_space =
block_info::deduced_address_space;
if constexpr (deduced_address_space ==
access::address_space::local_space &&
!props.template has_property<
detail::native_local_block_io_key>())
return group_store(g, in, out_ptr, use_naive{});

// Do optimized store.
std::remove_const_t<remove_decoration_t<
typename std::iterator_traits<OutputIteratorT>::value_type>>
@@ -341,11 +392,28 @@ group_store(Group g, const span<InputT, ElementsPerWorkItem> in,
values[i] = in[i];
}

__spirv_SubgroupBlockWriteINTEL(
ptr,
sycl::bit_cast<typename detail::BlockTypeInfo<detail::BlockInfo<
OutputIteratorT, ElementsPerWorkItem, blocked>>::block_op_type>(
values));
using block_op_type = typename block_info::block_op_type;
if constexpr (deduced_address_space ==
access::address_space::generic_space) {
if (auto local_ptr = detail::dynamic_address_cast<
access::address_space::local_space>(ptr)) {
if constexpr (props.template has_property<
detail::native_local_block_io_key>())
__spirv_SubgroupBlockWriteINTEL(
local_ptr, sycl::bit_cast<block_op_type>(values));
else
return group_store(g, in, out_ptr, use_naive{});
} else if (auto global_ptr = detail::dynamic_address_cast<
access::address_space::global_space>(ptr)) {
__spirv_SubgroupBlockWriteINTEL(
global_ptr, sycl::bit_cast<block_op_type>(values));
} else {
return group_store(g, in, out_ptr, use_naive{});
}
} else {
__spirv_SubgroupBlockWriteINTEL(ptr,
sycl::bit_cast<block_op_type>(values));
}
}
}
}
3 changes: 2 additions & 1 deletion sycl/include/sycl/ext/oneapi/properties/property.hpp
Original file line number Diff line number Diff line change
@@ -224,8 +224,9 @@ enum PropKind : uint32_t {
WorkGroupScratchSize = 79,
Restrict = 80,
EventMode = 81,
NativeLocalBlockIO = 82,
// PropKindSize must always be the last value.
PropKindSize = 82,
PropKindSize = 83,
};

template <typename PropertyT> struct PropertyToKind {
79 changes: 66 additions & 13 deletions sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp
Original file line number Diff line number Diff line change
@@ -6,8 +6,9 @@

#include <numeric>

int main() {
using namespace sycl;
using namespace sycl;

template <access::address_space addr_space> int test(queue &q) {
namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t wg_size = 32;
@@ -16,8 +17,6 @@ int main() {
constexpr std::size_t elems_per_wi = 4;
constexpr std::size_t n = global_size * elems_per_wi;

queue q;

buffer<int, 1> input_buf{n};

{
@@ -42,8 +41,10 @@ int main() {
accessor store_blocked{store_blocked_buf, cgh};
accessor store_striped{store_striped_buf, cgh};

local_accessor<int, 1> local_acc{wg_size * elems_per_wi, cgh};
cgh.parallel_for(nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto lid = ndi.get_local_id(0);
auto g = ndi.get_group();
auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi;

@@ -52,31 +53,76 @@ int main() {
auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};

if constexpr (addr_space == access::address_space::local_space) {
// Copy input to local memory.
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
i++) {
local_acc[i] = input[offset + i];
}
ndi.barrier(access::fence_space::local_space);
}

// default
sycl_exp::group_load(g, input.begin() + offset, span{data});
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data});
} else {
sycl_exp::group_load(g, input.begin() + offset, span{data});
}
for (int i = 0; i < elems_per_wi; ++i)
load_blocked_default[gid * elems_per_wi + i] = data[i];

// blocked
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked);
} else {
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
}
for (int i = 0; i < elems_per_wi; ++i)
load_blocked[gid * elems_per_wi + i] = data[i];

// striped
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, striped);
} else {
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
}
for (int i = 0; i < elems_per_wi; ++i)
load_striped[gid * elems_per_wi + i] = data[i];

// Stores:

std::iota(std::begin(data), std::end(data), gid * elems_per_wi);

sycl_exp::group_store(g, span{data},
store_blocked_default.begin() + offset);
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
blocked);
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
striped);
auto copy_local_acc_to_global_output = [&](accessor<int, 1> output) {
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
i++) {
output[offset + i] = local_acc[i];
}
};

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin());
copy_local_acc_to_global_output(store_blocked_default);
} else {
sycl_exp::group_store(g, span{data},
store_blocked_default.begin() + offset);
}

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked);
copy_local_acc_to_global_output(store_blocked);
} else {
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
blocked);
}

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), striped);
copy_local_acc_to_global_output(store_striped);
} else {
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
striped);
}
});
});

@@ -111,3 +157,10 @@ int main() {

return 0;
}

int main() {
queue q;
test<access::address_space::global_space>(q);
test<access::address_space::local_space>(q);
return 0;
}
24 changes: 20 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp
Original file line number Diff line number Diff line change
@@ -13,14 +13,12 @@ struct S {
int i;
};

int main() {
template <sycl::access::address_space addr_space> int test(sycl::queue &q) {
using namespace sycl;
namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t wg_size = 16;

queue q;

buffer<int, 1> input_buf{wg_size * 2};
{
host_accessor acc{input_buf};
@@ -31,12 +29,23 @@ int main() {
q.submit([&](handler &cgh) {
accessor input{input_buf, cgh};
accessor success{success_buf, cgh};
local_accessor<int, 1> local_acc{wg_size * 2, cgh};
cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto lid = ndi.get_local_id(0);
auto g = ndi.get_group();

S data[2];
sycl_exp::group_load(g, input.begin(), span{data});

if constexpr (addr_space == access::address_space::local_space) {
for (int i = lid * 2; i < lid * 2 + 2; i++) {
local_acc[i] = input[i];
}
ndi.barrier(access::fence_space::local_space);
sycl_exp::group_load(g, local_acc.begin(), span{data});
} else {
sycl_exp::group_load(g, input.begin(), span{data});
}

bool ok = true;
ok &= (data[0].i == gid * 2 + 0 + 42);
@@ -50,3 +59,10 @@ int main() {

return 0;
}

int main() {
sycl::queue q;
test<sycl::access::address_space::global_space>(q);
test<sycl::access::address_space::local_space>(q);
return 0;
}
24 changes: 20 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp
Original file line number Diff line number Diff line change
@@ -11,26 +11,35 @@ struct S {
};
static_assert(std::is_trivially_copyable_v<S>);

int main() {
template <sycl::access::address_space addr_space> int test(sycl::queue &q) {
using namespace sycl;
namespace sycl_exp = sycl::ext::oneapi::experimental;

constexpr std::size_t wg_size = 16;

queue q;

buffer<int, 1> output_buf{wg_size * 2};

q.submit([&](handler &cgh) {
accessor output{output_buf, cgh};
local_accessor<int, 1> local_acc{wg_size * 2, cgh};
cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto lid = ndi.get_local_id(0);
auto g = ndi.get_group();

S data[2];
data[0].i = gid * 2 + 0;
data[1].i = gid * 2 + 1;
sycl_exp::group_store(g, span{data}, output.begin());

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin());
ndi.barrier(access::fence_space::local_space);
for (int i = lid * 2; i < lid * 2 + 2; i++) {
output[i] = local_acc[i];
}
} else {
sycl_exp::group_store(g, span{data}, output.begin());
}
});
});

@@ -41,3 +50,10 @@ int main() {

return 0;
}

int main() {
sycl::queue q;
test<sycl::access::address_space::global_space>(q);
test<sycl::access::address_space::local_space>(q);
return 0;
}
60 changes: 50 additions & 10 deletions sycl/test-e2e/GroupAlgorithm/load_store/odd_sized_type.cpp
Original file line number Diff line number Diff line change
@@ -30,14 +30,11 @@ struct __attribute__((packed)) S {
static_assert(sizeof(S) == 7);
static_assert(std::is_trivially_copyable_v<S>);

int main() {

template <access::address_space addr_space> int test(queue &q) {
constexpr std::size_t wg_size = 32;
constexpr std::size_t elems_per_wi = 2;
constexpr std::size_t n = wg_size * elems_per_wi;

queue q;

buffer<S, 1> input_buf{n};

{
@@ -58,31 +55,67 @@ int main() {
accessor store_blocked{store_blocked_buf, cgh};
accessor store_striped{store_striped_buf, cgh};

local_accessor<S, 1> local_acc{wg_size * elems_per_wi, cgh};

cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) {
auto gid = ndi.get_global_id(0);
auto lid = ndi.get_local_id(0);
auto g = ndi.get_group();

S data[elems_per_wi];

auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};

if constexpr (addr_space == access::address_space::local_space) {
// Copy input to local memory.
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
i++) {
local_acc[i] = input[i];
}
ndi.barrier(access::fence_space::local_space);
}

// blocked
sycl_exp::group_load(g, input.begin(), span{data}, blocked);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked);
} else {
sycl_exp::group_load(g, input.begin(), span{data}, blocked);
}
for (int i = 0; i < elems_per_wi; ++i)
load_blocked[gid * elems_per_wi + i] = data[i];

// striped
sycl_exp::group_load(g, input.begin(), span{data}, striped);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, striped);
} else {
sycl_exp::group_load(g, input.begin(), span{data}, striped);
}
for (int i = 0; i < elems_per_wi; ++i)
load_striped[gid * elems_per_wi + i] = data[i];

// Stores:

std::iota(std::begin(data), std::end(data), gid * elems_per_wi);

sycl_exp::group_store(g, span{data}, store_blocked.begin(), blocked);
sycl_exp::group_store(g, span{data}, store_striped.begin(), striped);
auto copy_local_acc_to_global_output = [&](accessor<S, 1> output) {
for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi;
i++) {
output[i] = local_acc[i];
}
};

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked);
copy_local_acc_to_global_output(store_blocked);
} else {
sycl_exp::group_store(g, span{data}, store_blocked.begin(), blocked);
}

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), striped);
copy_local_acc_to_global_output(store_striped);
} else {
sycl_exp::group_store(g, span{data}, store_striped.begin(), striped);
}
});
});

@@ -119,3 +152,10 @@ int main() {

return 0;
}

int main() {
queue q;
test<access::address_space::global_space>(q);
test<access::address_space::local_space>(q);
return 0;
}
60 changes: 51 additions & 9 deletions sycl/test-e2e/GroupAlgorithm/load_store/odd_wg_size.cpp
Original file line number Diff line number Diff line change
@@ -12,7 +12,8 @@ namespace sycl_exp = sycl::ext::oneapi::experimental;
// Similar to partial_sg.cpp, but check group (vs. sub_group) loads/stores when
// WG_SIZE isn't equally divisible by SG_SIZE.

template <int SG_SIZE, int WG_SIZE> void test(queue &q) {
template <access::address_space addr_space, int SG_SIZE, int WG_SIZE>
void test(queue &q) {
constexpr std::size_t wg_size = WG_SIZE;
constexpr std::size_t n_wgs = 2;
constexpr std::size_t global_size = n_wgs * wg_size;
@@ -39,10 +40,12 @@ template <int SG_SIZE, int WG_SIZE> void test(queue &q) {
accessor store_blocked{store_blocked_buf, cgh};
accessor store_striped{store_striped_buf, cgh};

local_accessor<int, 1> local_acc{wg_size * elems_per_wi, cgh};
cgh.parallel_for(
nd_range<1>{global_size, wg_size},
[=](nd_item<1> ndi) [[sycl::reqd_sub_group_size(SG_SIZE)]] {
auto gid = ndi.get_global_id(0);
auto lid = ndi.get_local_id(0);
auto g = ndi.get_group();
auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi;

@@ -51,24 +54,61 @@ template <int SG_SIZE, int WG_SIZE> void test(queue &q) {
auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};

if constexpr (addr_space == access::address_space::local_space) {
// Copy input to local memory.
for (int i = lid * elems_per_wi;
i < lid * elems_per_wi + elems_per_wi; i++) {
local_acc[i] = input[offset + i];
}
ndi.barrier(access::fence_space::local_space);
}

// blocked
sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked);
} else {
sycl_exp::group_load(g, input.begin() + offset, span{data},
blocked);
}
for (int i = 0; i < elems_per_wi; ++i)
load_blocked[gid * elems_per_wi + i] = data[i];

// striped
sycl_exp::group_load(g, input.begin() + offset, span{data}, striped);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(g, local_acc.begin(), span{data}, striped);
} else {
sycl_exp::group_load(g, input.begin() + offset, span{data},
striped);
}
for (int i = 0; i < elems_per_wi; ++i)
load_striped[gid * elems_per_wi + i] = data[i];

// Stores:

std::iota(std::begin(data), std::end(data), gid * elems_per_wi);

sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
blocked);
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
striped);
auto copy_local_acc_to_global_output = [&](accessor<int, 1> output) {
for (int i = lid * elems_per_wi;
i < lid * elems_per_wi + elems_per_wi; i++) {
output[offset + i] = local_acc[i];
}
};

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked);
copy_local_acc_to_global_output(store_blocked);
} else {
sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset,
blocked);
}

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(g, span{data}, local_acc.begin(), striped);
copy_local_acc_to_global_output(store_striped);
} else {
sycl_exp::group_store(g, span{data}, store_striped.begin() + offset,
striped);
}
});
});

@@ -110,8 +150,10 @@ int main() {
if (std::none_of(device_sg_sizes.begin(), device_sg_sizes.end(),
[](auto x) { return x == sg_size; }))
return;
test<sg_size, sg_size / 2>(q);
test<sg_size, sg_size * 3 / 2>(q);
test<access::address_space::global_space, sg_size, sg_size / 2>(q);
test<access::address_space::local_space, sg_size, sg_size / 2>(q);
test<access::address_space::global_space, sg_size, sg_size * 3 / 2>(q);
test<access::address_space::local_space, sg_size, sg_size * 3 / 2>(q);
});

return 0;
66 changes: 55 additions & 11 deletions sycl/test-e2e/GroupAlgorithm/load_store/partial_sg.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
// RUN: %{build} -Wno-error=incorrect-sub-group-size -o %t.out
// RUN: %{run} %t.out

#include <numeric>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/experimental/group_load_store.hpp>
#include <sycl/group_barrier.hpp>
#include <sycl/sub_group.hpp>

#include <numeric>

using namespace sycl;
namespace sycl_exp = sycl::ext::oneapi::experimental;

template <int SG_SIZE> void test(queue &q) {
template <access::address_space addr_space, int SG_SIZE> void test(queue &q) {
// Verify scenario when the last sub_group isn't full.
constexpr std::size_t wg_size = SG_SIZE * 3 / 2;
constexpr std::size_t elems_per_wi = 4;
@@ -36,6 +36,7 @@ template <int SG_SIZE> void test(queue &q) {
accessor store_blocked{store_blocked_buf, cgh};
accessor store_striped{store_striped_buf, cgh};

local_accessor<int, 1> local_acc{wg_size * elems_per_wi, cgh};
cgh.parallel_for(
nd_range<1>{wg_size, wg_size},
[=](nd_item<1> ndi) [[sycl::reqd_sub_group_size(SG_SIZE)]] {
@@ -49,24 +50,65 @@ template <int SG_SIZE> void test(queue &q) {
auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked};
auto striped = sycl_exp::properties{sycl_exp::data_placement_striped};

if constexpr (addr_space == access::address_space::local_space) {
// Copy input to local memory.
for (int i = sg.get_local_id() * elems_per_wi;
i < sg.get_local_id() * elems_per_wi + elems_per_wi; i++) {
local_acc[offset + i] = input[offset + i];
}
group_barrier(sg);
}

// blocked
sycl_exp::group_load(sg, input.begin() + offset, span{data}, blocked);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(sg, local_acc.begin() + offset, span{data},
blocked);
} else {
sycl_exp::group_load(sg, input.begin() + offset, span{data},
blocked);
}
for (int i = 0; i < elems_per_wi; ++i)
load_blocked[gid * elems_per_wi + i] = data[i];

// striped
sycl_exp::group_load(sg, input.begin() + offset, span{data}, striped);
if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_load(sg, local_acc.begin() + offset, span{data},
striped);
} else {
sycl_exp::group_load(sg, input.begin() + offset, span{data},
striped);
}
for (int i = 0; i < elems_per_wi; ++i)
load_striped[gid * elems_per_wi + i] = data[i];

// Stores:

std::iota(std::begin(data), std::end(data), gid * elems_per_wi);

sycl_exp::group_store(sg, span{data}, store_blocked.begin() + offset,
blocked);
sycl_exp::group_store(sg, span{data}, store_striped.begin() + offset,
striped);
auto copy_local_acc_to_global_output = [&](accessor<int, 1> output) {
for (int i = sg.get_local_id() * elems_per_wi;
i < sg.get_local_id() * elems_per_wi + elems_per_wi; i++) {
output[offset + i] = local_acc[offset + i];
}
};

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(sg, span{data}, local_acc.begin() + offset,
blocked);
copy_local_acc_to_global_output(store_blocked);
} else {
sycl_exp::group_store(sg, span{data},
store_blocked.begin() + offset, blocked);
}

if constexpr (addr_space == access::address_space::local_space) {
sycl_exp::group_store(sg, span{data}, local_acc.begin() + offset,
striped);
copy_local_acc_to_global_output(store_striped);
} else {
sycl_exp::group_store(sg, span{data},
store_striped.begin() + offset, striped);
}
});
});

@@ -110,8 +152,10 @@ int main() {
detail::loop<std::size(sg_sizes)>([&](auto sg_size_idx) {
constexpr auto sg_size = sg_sizes[sg_size_idx];
if (std::any_of(device_sg_sizes.begin(), device_sg_sizes.end(),
[](auto x) { return x == sg_size; }))
test<sg_size>(q);
[](auto x) { return x == sg_size; })) {
test<access::address_space::global_space, sg_size>(q);
test<access::address_space::local_space, sg_size>(q);
}
});

return 0;
1,644 changes: 1,087 additions & 557 deletions sycl/test/check_device_code/group_load.cpp

Large diffs are not rendered by default.

179 changes: 179 additions & 0 deletions sycl/test/check_device_code/group_load_store_native_key.cpp

Large diffs are not rendered by default.

2,064 changes: 1,335 additions & 729 deletions sycl/test/check_device_code/group_store.cpp

Large diffs are not rendered by default.