Skip to content

Commit

Permalink
Outline sequential kernel loops, implement chaos mode for simple PF
Browse files Browse the repository at this point in the history
  • Loading branch information
fknorr committed Jan 4, 2024
1 parent 0689c53 commit 8a7a4fe
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 128 deletions.
5 changes: 4 additions & 1 deletion include/simsycl/detail/group_operation_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,10 @@ struct concurrent_sub_group {
sub_group_instance instance;
};

inline detail::concurrent_sub_group &get_concurrent_group(const sycl::sub_group &g) { return *g.m_concurrent_group; }
inline detail::concurrent_sub_group &get_concurrent_group(const sycl::sub_group &g) {
SIMSYCL_CHECK(g.m_concurrent_group && "group operations not available in this kernel");
return *g.m_concurrent_group;
}

// group operation function template

Expand Down
142 changes: 26 additions & 116 deletions include/simsycl/detail/parallel_for.hh
Original file line number Diff line number Diff line change
Expand Up @@ -23,123 +23,17 @@ namespace simsycl::detail {
struct no_offset_t {
} inline constexpr no_offset;

template<typename Func, typename... Params>
void sequential_for(const sycl::range<1> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
sycl::id<1> id;
for(id[0] = 0; id[0] < range[0]; ++id[0]) { //
func(make_item(id, range), std::forward<Params>(args)...);
}
}

template<typename Func, typename... Params>
void sequential_for(const sycl::range<2> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
sycl::id<2> id;
for(id[0] = 0; id[0] < range[0]; ++id[0]) {
for(id[1] = 0; id[1] < range[1]; ++id[1]) { //
func(make_item(id, range), std::forward<Params>(args)...);
}
}
}

template<typename Func, typename... Params>
void sequential_for(const sycl::range<3> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
sycl::id<3> id;
for(id[0] = 0; id[0] < range[0]; ++id[0]) {
for(id[1] = 0; id[1] < range[1]; ++id[1]) {
for(id[2] = 0; id[2] < range[2]; ++id[2]) { //
func(make_item(id, range), std::forward<Params>(args)...);
}
}
}
}

template<typename Func, typename... Params>
void sequential_for(const sycl::range<1> &range, const sycl::id<1> &offset, Func &&func, Params &&...args) {
sycl::id<1> id;
for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) { //
func(make_item(id, range, offset), std::forward<Params>(args)...);
}
}
template<typename>
struct with_offset;
template<>

template<typename Func, typename... Params>
void sequential_for(const sycl::range<2> &range, const sycl::id<2> &offset, Func &&func, Params &&...args) {
sycl::id<2> id;
for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) {
for(id[1] = offset[1]; id[1] < offset[1] + range[1]; ++id[1]) { //
func(make_item(id, range, offset), std::forward<Params>(args)...);
}
}
}

template<typename Func, typename... Params>
void sequential_for(const sycl::range<3> &range, const sycl::id<3> &offset, Func &&func, Params &&...args) {
sycl::id<3> id;
for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) {
for(id[1] = offset[1]; id[1] < offset[1] + range[1]; ++id[1]) {
for(id[2] = offset[2]; id[2] < offset[2] + range[2]; ++id[2]) { //
func(make_item(id, range, offset), std::forward<Params>(args)...);
}
}
}
}


template<typename WorkgroupFunctionType>
void sequential_for_work_group(sycl::range<1> num_work_groups, std::optional<sycl::range<1>> work_group_size,
const WorkgroupFunctionType &kernel_func) {
sycl::id<1> group_id;
const auto type
= work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
concurrent_group impl;
const auto group_item = make_item(group_id, num_work_groups);
const auto local_item = make_item(sycl::id(0), work_group_size.value_or(sycl::range(1)));
const auto global_item = make_item(
group_id * sycl::id(local_item.get_range()), local_item.get_range() * group_item.get_range(), sycl::id(0));
sycl::group<1> group = make_group(type, local_item, global_item, group_item, &impl);
kernel_func(group);
}
}
struct with_offset<no_offset_t> : std::false_type {};
template<int Dimensions>

template<typename WorkgroupFunctionType>
void sequential_for_work_group(sycl::range<2> num_work_groups, std::optional<sycl::range<2>> work_group_size,
const WorkgroupFunctionType &kernel_func) {
sycl::id<2> group_id;
const auto type
= work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
concurrent_group impl;
const auto group_item = make_item(group_id, num_work_groups);
const auto local_item = make_item(sycl::id(0, 0), work_group_size.value_or(sycl::range(1, 1)));
const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
local_item.get_range() * group_item.get_range(), sycl::id(0, 0));
sycl::group<2> group = make_group(type, local_item, global_item, group_item, &impl);
kernel_func(group);
}
}
}
struct with_offset<sycl::id<Dimensions>> : std::true_type {};

template<typename WorkgroupFunctionType>
void sequential_for_work_group(sycl::range<3> num_work_groups, std::optional<sycl::range<3>> work_group_size,
const WorkgroupFunctionType &kernel_func) {
sycl::id<3> group_id;
const auto type
= work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
for(group_id[2] = 0; group_id[2] < num_work_groups[2]; ++group_id[2]) {
concurrent_group impl;
const auto group_item = make_item(group_id, num_work_groups);
const auto local_item = make_item(sycl::id(0, 0, 0), work_group_size.value_or(sycl::range(1, 1, 1)));
const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
local_item.get_range() * group_item.get_range(), sycl::id(0, 0, 0));
sycl::group<3> group = make_group(type, local_item, global_item, group_item, &impl);
kernel_func(group);
}
}
}
}
template<typename T>
inline constexpr bool with_offset_v = with_offset<T>::value;


struct local_memory_requirement {
Expand All @@ -152,6 +46,20 @@ struct local_memory_requirement {
template<int Dimensions>
using nd_kernel = std::function<void(const sycl::nd_item<Dimensions> &)>;

template<int Dimensions, bool WithOffset>
using simple_kernel = std::function<void(const sycl::item<Dimensions, WithOffset> &)>;

template<int Dimensions>
using hierarchical_kernel = std::function<void(const sycl::group<Dimensions> &)>;

template<int Dimensions, typename Offset>
void sequential_for(const sycl::range<Dimensions> &range, const Offset &offset,
const simple_kernel<Dimensions, with_offset_v<Offset>> &kernel);

template<int Dimensions>
void sequential_for_work_group(sycl::range<Dimensions> num_work_groups,
std::optional<sycl::range<Dimensions>> work_group_size, const hierarchical_kernel<Dimensions> &kernel);

template<int Dimensions>
void cooperative_for_nd_range(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
const std::vector<local_memory_requirement> &local_memory, const nd_kernel<Dimensions> &kernel);
Expand All @@ -160,7 +68,9 @@ template<typename KernelName, int Dimensions, typename Offset, typename KernelFu
void execute_parallel_for(
const sycl::range<Dimensions> &range, const Offset &offset, KernelFunc &&func, Params &&...args) {
register_kernel_on_static_construction<KernelName, KernelFunc>();
sequential_for(range, offset, func, std::forward<Params>(args)...);
const simple_kernel<Dimensions, with_offset_v<Offset>> kernel(
[&](const sycl::item<Dimensions> &item) { func(item, std::forward<Params>(args)...); });
sequential_for(range, offset, kernel);
}

template<typename KernelName, int Dimensions, typename KernelFunc, typename... Params>
Expand Down Expand Up @@ -226,7 +136,7 @@ void parallel_for_work_group(const sycl::device &device, sycl::range<Dimensions>
const WorkgroupFunctionType &kernel_func) {
register_kernel_on_static_construction<KernelName, WorkgroupFunctionType>();
const auto local_allocations = prepare_hierarchical_parallel_for(device, work_group_size, local_memory);
sequential_for_work_group(num_work_groups, work_group_size, kernel_func);
sequential_for_work_group(num_work_groups, work_group_size, hierarchical_kernel<Dimensions>(kernel_func));
}

} // namespace simsycl::detail
7 changes: 1 addition & 6 deletions include/simsycl/detail/reference_type.hh
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,7 @@ class reference_type {
static_assert(std::is_base_of_v<reference_type, Derived>);
}

state_type &state() {
SIMSYCL_CHECK(m_state != nullptr);
return *m_state;
}

const state_type &state() const {
state_type &state() const {
SIMSYCL_CHECK(m_state != nullptr);
return *m_state;
}
Expand Down
3 changes: 3 additions & 0 deletions include/simsycl/sycl/device.hh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ struct accelerator_selector {

struct device_state;

size_t *device_bytes_free(const sycl::device &device);

} // namespace simsycl::detail

namespace simsycl::sycl {
Expand Down Expand Up @@ -105,6 +107,7 @@ class device final : public detail::reference_type<device, detail::device_state>

friend device simsycl::make_device(sycl::platform &platform, const device_config &config);
friend void simsycl::set_parent_device(sycl::device &device, const sycl::device &parent);
friend size_t *detail::device_bytes_free(const sycl::device &device);

device(const detail::device_selector &selector);
device(std::shared_ptr<detail::device_state> &&state) : reference_type(std::move(state)) {}
Expand Down
2 changes: 0 additions & 2 deletions include/simsycl/sycl/forward.hh
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,6 @@ struct event_state;

sycl::event make_event(std::shared_ptr<event_state> &&state);

void enter_kernel_fiber(boost::context::continuation &&from_scheduler);
boost::context::continuation &&leave_kernel_fiber();
void yield_to_kernel_scheduler();
void maybe_yield_to_kernel_scheduler();

Expand Down
4 changes: 4 additions & 0 deletions src/simsycl/device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ struct device_state {
weak_ref<sycl::device> parent;
};

size_t *device_bytes_free(const sycl::device &device) {
return &device.state().bytes_free;
}

int default_selector::operator()(const sycl::device &device) const {
return device.is_gpu() || device.is_accelerator() ? 1 : 0;
}
Expand Down
69 changes: 69 additions & 0 deletions src/simsycl/schedule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,75 @@ cooperative_schedule::state shuffle_schedule::update(state state_before, std::ve

namespace simsycl::detail {

template<int Dimensions, typename Offset>
void sequential_for(const sycl::range<Dimensions> &range, const Offset &offset,
const simple_kernel<Dimensions, with_offset_v<Offset>> &kernel) {
// limit the number of work items scheduled at a time to avoid allocating huge index buffers
constexpr size_t max_schedule_chunk_size = 16 << 10;
const auto schedule_chunk_size = std::min(range.size(), max_schedule_chunk_size);
const auto &schedule = get_cooperative_schedule();
std::vector<size_t> order(schedule_chunk_size);
auto schedule_state = schedule.init(order);

for(size_t schedule_offset = 0; schedule_offset < range.size(); schedule_offset += max_schedule_chunk_size) {
for(size_t schedule_id = 0; schedule_id < schedule_chunk_size; ++schedule_id) {
const auto linear_id = schedule_offset + order[schedule_id];
if(linear_id < range.size()) {
if constexpr(with_offset_v<Offset>) {
const auto id = offset + linear_index_to_id(range, linear_id);
kernel(make_item(id, range, offset));
} else {
const auto id = linear_index_to_id(range, linear_id);
kernel(make_item(id, range));
}
}
}
schedule_state = schedule.update(schedule_state, order);
}
}

template void sequential_for(
const sycl::range<1> &range, const no_offset_t & /* no offset */, const simple_kernel<1, false> &kernel);
template void sequential_for(
const sycl::range<2> &range, const no_offset_t & /* no offset */, const simple_kernel<2, false> &kernel);
template void sequential_for(
const sycl::range<3> &range, const no_offset_t & /* no offset */, const simple_kernel<3, false> &kernel);
template void sequential_for<1, sycl::id<1>>(
const sycl::range<1> &range, const sycl::id<1> &offset, const simple_kernel<1, true> &kernel);
template void sequential_for(
const sycl::range<2> &range, const sycl::id<2> &offset, const simple_kernel<2, true> &kernel);
template void sequential_for(
const sycl::range<3> &range, const sycl::id<3> &offset, const simple_kernel<3, true> &kernel);

template<int Dimensions>
sycl::range<Dimensions> unit_range() {
sycl::range<Dimensions> r;
for(int i = 0; i < Dimensions; ++i) { r[i] = 1; }
return r;
}

template<int Dimensions>
void sequential_for_work_group(sycl::range<Dimensions> num_work_groups,
std::optional<sycl::range<Dimensions>> work_group_size, const hierarchical_kernel<Dimensions> &kernel) {
const auto type
= work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
for(size_t group_linear_id = 0; group_linear_id < num_work_groups.size(); ++group_linear_id) {
const auto group_id = linear_index_to_id(num_work_groups, group_linear_id);
const auto group_item = make_item(group_id, num_work_groups);
const auto local_item = make_item(sycl::id<Dimensions>(), work_group_size.value_or(unit_range<Dimensions>()));
const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
local_item.get_range() * group_item.get_range(), sycl::id<Dimensions>());
kernel(make_group(type, local_item, global_item, group_item, nullptr));
}
}

template void sequential_for_work_group(sycl::range<1> num_work_groups, std::optional<sycl::range<1>> work_group_size,
const hierarchical_kernel<1> &kernel);
template void sequential_for_work_group(sycl::range<2> num_work_groups, std::optional<sycl::range<2>> work_group_size,
const hierarchical_kernel<2> &kernel);
template void sequential_for_work_group(sycl::range<3> num_work_groups, std::optional<sycl::range<3>> work_group_size,
const hierarchical_kernel<3> &kernel);

boost::context::continuation g_scheduler;

void enter_kernel_fiber(boost::context::continuation &&from_scheduler) {
Expand Down
5 changes: 2 additions & 3 deletions src/simsycl/system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ struct memory_state {
struct system_state {
std::vector<sycl::platform> platforms;
std::vector<sycl::device> devices;
std::unordered_map<sycl::device, size_t> device_bytes_free;
std::set<usm_allocation, usm_allocation_order> usm_allocations;

explicit system_state(const system_config &config) {
Expand Down Expand Up @@ -181,7 +180,7 @@ void *usm_alloc(const sycl::context &context, sycl::usm::alloc kind, std::option
throw sycl::exception(sycl::errc::memory_allocation, "Allocation size exceeds device limit");
}

bytes_free = &system.device_bytes_free.at(*device);
bytes_free = detail::device_bytes_free(*device);
if(*bytes_free < size_bytes) {
throw sycl::exception(sycl::errc::memory_allocation, "Not enough memory available");
}
Expand Down Expand Up @@ -225,7 +224,7 @@ void usm_free(void *ptr, const sycl::context &context) {
#endif

if(iter->get_device().has_value()) {
system.device_bytes_free.at(iter->get_device().value()) += iter->get_size_bytes();
*detail::device_bytes_free(iter->get_device().value()) += iter->get_size_bytes();
}
system.usm_allocations.erase(iter);
}
Expand Down
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ add_executable(tests
marray_tests.cc
math_tests.cc
reduction_tests.cc
schedule_tests.cc
vec_tests.cc
)

Expand Down
38 changes: 38 additions & 0 deletions test/schedule_tests.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include <sycl/sycl.hpp>

#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>


bool is_fibonacci(const int *buf, int n) {
for(int i = 0; i < n; ++i) {
if(buf[i] != (i < 2 ? i : buf[i - 1] + buf[i - 2])) return false;
}
return true;
}

TEST_CASE(
"naive racy fibonacci implementation is produces expected results only with round-robin schedule", "[schedule]") {
const bool chaos_mode = GENERATE(values<int>({false, true}));
if(chaos_mode) { simsycl::set_cooperative_schedule(std::make_unique<simsycl::shuffle_schedule>()); }
INFO((chaos_mode ? "shuffle schedule" : "round-robin schedule"));

sycl::queue q;
auto *buf = sycl::malloc_shared<int>(100, q);

SECTION("in simple parallel_for") {
q.parallel_for(sycl::range<1>(100), [=](sycl::item<1> item) {
const int i = item.get_id(0);
buf[i] = i < 2 ? i : buf[i - 1] + buf[i - 2];
});
CHECK(is_fibonacci(buf, 100) == !chaos_mode);
}

SECTION("in nd_range parallel_for") {
q.parallel_for(sycl::nd_range<1>(100, 100), [=](sycl::nd_item<1> item) {
const int i = item.get_global_id(0);
buf[i] = i < 2 ? i : buf[i - 1] + buf[i - 2];
});
CHECK(is_fibonacci(buf, 100) == !chaos_mode);
}
}

0 comments on commit 8a7a4fe

Please sign in to comment.