Outline sequential kernel loops, implement chaos mode for simple PF

celerity · Jan 4, 2024 · 8a7a4fe · 8a7a4fe
1 parent 0689c53
commit 8a7a4fe
Show file tree

Hide file tree

Showing 10 changed files with 148 additions and 128 deletions.
diff --git a/include/simsycl/detail/group_operation_impl.hh b/include/simsycl/detail/group_operation_impl.hh
@@ -183,7 +183,10 @@ struct concurrent_sub_group {
     sub_group_instance instance;
 };
 
-inline detail::concurrent_sub_group &get_concurrent_group(const sycl::sub_group &g) { return *g.m_concurrent_group; }
+inline detail::concurrent_sub_group &get_concurrent_group(const sycl::sub_group &g) {
+    SIMSYCL_CHECK(g.m_concurrent_group && "group operations not available in this kernel");
+    return *g.m_concurrent_group;
+}
 
 // group operation function template
 

diff --git a/include/simsycl/detail/parallel_for.hh b/include/simsycl/detail/parallel_for.hh
@@ -23,123 +23,17 @@ namespace simsycl::detail {
 struct no_offset_t {
 } inline constexpr no_offset;
 
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<1> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
-    sycl::id<1> id;
-    for(id[0] = 0; id[0] < range[0]; ++id[0]) { //
-        func(make_item(id, range), std::forward<Params>(args)...);
-    }
-}
-
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<2> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
-    sycl::id<2> id;
-    for(id[0] = 0; id[0] < range[0]; ++id[0]) {
-        for(id[1] = 0; id[1] < range[1]; ++id[1]) { //
-            func(make_item(id, range), std::forward<Params>(args)...);
-        }
-    }
-}
-
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<3> &range, no_offset_t /* no offset */, Func &&func, Params &&...args) {
-    sycl::id<3> id;
-    for(id[0] = 0; id[0] < range[0]; ++id[0]) {
-        for(id[1] = 0; id[1] < range[1]; ++id[1]) {
-            for(id[2] = 0; id[2] < range[2]; ++id[2]) { //
-                func(make_item(id, range), std::forward<Params>(args)...);
-            }
-        }
-    }
-}
-
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<1> &range, const sycl::id<1> &offset, Func &&func, Params &&...args) {
-    sycl::id<1> id;
-    for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) { //
-        func(make_item(id, range, offset), std::forward<Params>(args)...);
-    }
-}
+template<typename>
+struct with_offset;
+template<>
 
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<2> &range, const sycl::id<2> &offset, Func &&func, Params &&...args) {
-    sycl::id<2> id;
-    for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) {
-        for(id[1] = offset[1]; id[1] < offset[1] + range[1]; ++id[1]) { //
-            func(make_item(id, range, offset), std::forward<Params>(args)...);
-        }
-    }
-}
-
-template<typename Func, typename... Params>
-void sequential_for(const sycl::range<3> &range, const sycl::id<3> &offset, Func &&func, Params &&...args) {
-    sycl::id<3> id;
-    for(id[0] = offset[0]; id[0] < offset[0] + range[0]; ++id[0]) {
-        for(id[1] = offset[1]; id[1] < offset[1] + range[1]; ++id[1]) {
-            for(id[2] = offset[2]; id[2] < offset[2] + range[2]; ++id[2]) { //
-                func(make_item(id, range, offset), std::forward<Params>(args)...);
-            }
-        }
-    }
-}
-
-
-template<typename WorkgroupFunctionType>
-void sequential_for_work_group(sycl::range<1> num_work_groups, std::optional<sycl::range<1>> work_group_size,
-    const WorkgroupFunctionType &kernel_func) {
-    sycl::id<1> group_id;
-    const auto type
-        = work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
-    for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
-        concurrent_group impl;
-        const auto group_item = make_item(group_id, num_work_groups);
-        const auto local_item = make_item(sycl::id(0), work_group_size.value_or(sycl::range(1)));
-        const auto global_item = make_item(
-            group_id * sycl::id(local_item.get_range()), local_item.get_range() * group_item.get_range(), sycl::id(0));
-        sycl::group<1> group = make_group(type, local_item, global_item, group_item, &impl);
-        kernel_func(group);
-    }
-}
+struct with_offset<no_offset_t> : std::false_type {};
+template<int Dimensions>
 
-template<typename WorkgroupFunctionType>
-void sequential_for_work_group(sycl::range<2> num_work_groups, std::optional<sycl::range<2>> work_group_size,
-    const WorkgroupFunctionType &kernel_func) {
-    sycl::id<2> group_id;
-    const auto type
-        = work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
-    for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
-        for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
-            concurrent_group impl;
-            const auto group_item = make_item(group_id, num_work_groups);
-            const auto local_item = make_item(sycl::id(0, 0), work_group_size.value_or(sycl::range(1, 1)));
-            const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
-                local_item.get_range() * group_item.get_range(), sycl::id(0, 0));
-            sycl::group<2> group = make_group(type, local_item, global_item, group_item, &impl);
-            kernel_func(group);
-        }
-    }
-}
+struct with_offset<sycl::id<Dimensions>> : std::true_type {};
 
-template<typename WorkgroupFunctionType>
-void sequential_for_work_group(sycl::range<3> num_work_groups, std::optional<sycl::range<3>> work_group_size,
-    const WorkgroupFunctionType &kernel_func) {
-    sycl::id<3> group_id;
-    const auto type
-        = work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
-    for(group_id[0] = 0; group_id[0] < num_work_groups[0]; ++group_id[0]) {
-        for(group_id[1] = 0; group_id[1] < num_work_groups[1]; ++group_id[1]) {
-            for(group_id[2] = 0; group_id[2] < num_work_groups[2]; ++group_id[2]) {
-                concurrent_group impl;
-                const auto group_item = make_item(group_id, num_work_groups);
-                const auto local_item = make_item(sycl::id(0, 0, 0), work_group_size.value_or(sycl::range(1, 1, 1)));
-                const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
-                    local_item.get_range() * group_item.get_range(), sycl::id(0, 0, 0));
-                sycl::group<3> group = make_group(type, local_item, global_item, group_item, &impl);
-                kernel_func(group);
-            }
-        }
-    }
-}
+template<typename T>
+inline constexpr bool with_offset_v = with_offset<T>::value;
 
 
 struct local_memory_requirement {
@@ -152,6 +46,20 @@ struct local_memory_requirement {
 template<int Dimensions>
 using nd_kernel = std::function<void(const sycl::nd_item<Dimensions> &)>;
 
+template<int Dimensions, bool WithOffset>
+using simple_kernel = std::function<void(const sycl::item<Dimensions, WithOffset> &)>;
+
+template<int Dimensions>
+using hierarchical_kernel = std::function<void(const sycl::group<Dimensions> &)>;
+
+template<int Dimensions, typename Offset>
+void sequential_for(const sycl::range<Dimensions> &range, const Offset &offset,
+    const simple_kernel<Dimensions, with_offset_v<Offset>> &kernel);
+
+template<int Dimensions>
+void sequential_for_work_group(sycl::range<Dimensions> num_work_groups,
+    std::optional<sycl::range<Dimensions>> work_group_size, const hierarchical_kernel<Dimensions> &kernel);
+
 template<int Dimensions>
 void cooperative_for_nd_range(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
     const std::vector<local_memory_requirement> &local_memory, const nd_kernel<Dimensions> &kernel);
@@ -160,7 +68,9 @@ template<typename KernelName, int Dimensions, typename Offset, typename KernelFu
 void execute_parallel_for(
     const sycl::range<Dimensions> &range, const Offset &offset, KernelFunc &&func, Params &&...args) {
     register_kernel_on_static_construction<KernelName, KernelFunc>();
-    sequential_for(range, offset, func, std::forward<Params>(args)...);
+    const simple_kernel<Dimensions, with_offset_v<Offset>> kernel(
+        [&](const sycl::item<Dimensions> &item) { func(item, std::forward<Params>(args)...); });
+    sequential_for(range, offset, kernel);
 }
 
 template<typename KernelName, int Dimensions, typename KernelFunc, typename... Params>
@@ -226,7 +136,7 @@ void parallel_for_work_group(const sycl::device &device, sycl::range<Dimensions>
     const WorkgroupFunctionType &kernel_func) {
     register_kernel_on_static_construction<KernelName, WorkgroupFunctionType>();
     const auto local_allocations = prepare_hierarchical_parallel_for(device, work_group_size, local_memory);
-    sequential_for_work_group(num_work_groups, work_group_size, kernel_func);
+    sequential_for_work_group(num_work_groups, work_group_size, hierarchical_kernel<Dimensions>(kernel_func));
 }
 
 } // namespace simsycl::detail
diff --git a/include/simsycl/detail/reference_type.hh b/include/simsycl/detail/reference_type.hh
@@ -45,12 +45,7 @@ class reference_type {
         static_assert(std::is_base_of_v<reference_type, Derived>);
     }
 
-    state_type &state() {
-        SIMSYCL_CHECK(m_state != nullptr);
-        return *m_state;
-    }
-
-    const state_type &state() const {
+    state_type &state() const {
         SIMSYCL_CHECK(m_state != nullptr);
         return *m_state;
     }

diff --git a/include/simsycl/sycl/device.hh b/include/simsycl/sycl/device.hh
@@ -41,6 +41,8 @@ struct accelerator_selector {
 
 struct device_state;
 
+size_t *device_bytes_free(const sycl::device &device);
+
 } // namespace simsycl::detail
 
 namespace simsycl::sycl {
@@ -105,6 +107,7 @@ class device final : public detail::reference_type<device, detail::device_state>
 
     friend device simsycl::make_device(sycl::platform &platform, const device_config &config);
     friend void simsycl::set_parent_device(sycl::device &device, const sycl::device &parent);
+    friend size_t *detail::device_bytes_free(const sycl::device &device);
 
     device(const detail::device_selector &selector);
     device(std::shared_ptr<detail::device_state> &&state) : reference_type(std::move(state)) {}

diff --git a/include/simsycl/sycl/forward.hh b/include/simsycl/sycl/forward.hh
@@ -172,8 +172,6 @@ struct event_state;
 
 sycl::event make_event(std::shared_ptr<event_state> &&state);
 
-void enter_kernel_fiber(boost::context::continuation &&from_scheduler);
-boost::context::continuation &&leave_kernel_fiber();
 void yield_to_kernel_scheduler();
 void maybe_yield_to_kernel_scheduler();
 

diff --git a/src/simsycl/device.cc b/src/simsycl/device.cc
@@ -16,6 +16,10 @@ struct device_state {
     weak_ref<sycl::device> parent;
 };
 
+size_t *device_bytes_free(const sycl::device &device) {
+    return &device.state().bytes_free;
+}
+
 int default_selector::operator()(const sycl::device &device) const {
     return device.is_gpu() || device.is_accelerator() ? 1 : 0;
 }

diff --git a/src/simsycl/schedule.cc b/src/simsycl/schedule.cc
@@ -40,6 +40,75 @@ cooperative_schedule::state shuffle_schedule::update(state state_before, std::ve
 
 namespace simsycl::detail {
 
+template<int Dimensions, typename Offset>
+void sequential_for(const sycl::range<Dimensions> &range, const Offset &offset,
+    const simple_kernel<Dimensions, with_offset_v<Offset>> &kernel) {
+    // limit the number of work items scheduled at a time to avoid allocating huge index buffers
+    constexpr size_t max_schedule_chunk_size = 16 << 10;
+    const auto schedule_chunk_size = std::min(range.size(), max_schedule_chunk_size);
+    const auto &schedule = get_cooperative_schedule();
+    std::vector<size_t> order(schedule_chunk_size);
+    auto schedule_state = schedule.init(order);
+
+    for(size_t schedule_offset = 0; schedule_offset < range.size(); schedule_offset += max_schedule_chunk_size) {
+        for(size_t schedule_id = 0; schedule_id < schedule_chunk_size; ++schedule_id) {
+            const auto linear_id = schedule_offset + order[schedule_id];
+            if(linear_id < range.size()) {
+                if constexpr(with_offset_v<Offset>) {
+                    const auto id = offset + linear_index_to_id(range, linear_id);
+                    kernel(make_item(id, range, offset));
+                } else {
+                    const auto id = linear_index_to_id(range, linear_id);
+                    kernel(make_item(id, range));
+                }
+            }
+        }
+        schedule_state = schedule.update(schedule_state, order);
+    }
+}
+
+template void sequential_for(
+    const sycl::range<1> &range, const no_offset_t & /* no offset */, const simple_kernel<1, false> &kernel);
+template void sequential_for(
+    const sycl::range<2> &range, const no_offset_t & /* no offset */, const simple_kernel<2, false> &kernel);
+template void sequential_for(
+    const sycl::range<3> &range, const no_offset_t & /* no offset */, const simple_kernel<3, false> &kernel);
+template void sequential_for<1, sycl::id<1>>(
+    const sycl::range<1> &range, const sycl::id<1> &offset, const simple_kernel<1, true> &kernel);
+template void sequential_for(
+    const sycl::range<2> &range, const sycl::id<2> &offset, const simple_kernel<2, true> &kernel);
+template void sequential_for(
+    const sycl::range<3> &range, const sycl::id<3> &offset, const simple_kernel<3, true> &kernel);
+
+template<int Dimensions>
+sycl::range<Dimensions> unit_range() {
+    sycl::range<Dimensions> r;
+    for(int i = 0; i < Dimensions; ++i) { r[i] = 1; }
+    return r;
+}
+
+template<int Dimensions>
+void sequential_for_work_group(sycl::range<Dimensions> num_work_groups,
+    std::optional<sycl::range<Dimensions>> work_group_size, const hierarchical_kernel<Dimensions> &kernel) {
+    const auto type
+        = work_group_size.has_value() ? group_type::hierarchical_explicit_size : group_type::hierarchical_implicit_size;
+    for(size_t group_linear_id = 0; group_linear_id < num_work_groups.size(); ++group_linear_id) {
+        const auto group_id = linear_index_to_id(num_work_groups, group_linear_id);
+        const auto group_item = make_item(group_id, num_work_groups);
+        const auto local_item = make_item(sycl::id<Dimensions>(), work_group_size.value_or(unit_range<Dimensions>()));
+        const auto global_item = make_item(group_id * sycl::id(local_item.get_range()),
+            local_item.get_range() * group_item.get_range(), sycl::id<Dimensions>());
+        kernel(make_group(type, local_item, global_item, group_item, nullptr));
+    }
+}
+
+template void sequential_for_work_group(sycl::range<1> num_work_groups, std::optional<sycl::range<1>> work_group_size,
+    const hierarchical_kernel<1> &kernel);
+template void sequential_for_work_group(sycl::range<2> num_work_groups, std::optional<sycl::range<2>> work_group_size,
+    const hierarchical_kernel<2> &kernel);
+template void sequential_for_work_group(sycl::range<3> num_work_groups, std::optional<sycl::range<3>> work_group_size,
+    const hierarchical_kernel<3> &kernel);
+
 boost::context::continuation g_scheduler;
 
 void enter_kernel_fiber(boost::context::continuation &&from_scheduler) {

diff --git a/src/simsycl/system.cc b/src/simsycl/system.cc
@@ -114,7 +114,6 @@ struct memory_state {
 struct system_state {
     std::vector<sycl::platform> platforms;
     std::vector<sycl::device> devices;
-    std::unordered_map<sycl::device, size_t> device_bytes_free;
     std::set<usm_allocation, usm_allocation_order> usm_allocations;
 
     explicit system_state(const system_config &config) {
@@ -181,7 +180,7 @@ void *usm_alloc(const sycl::context &context, sycl::usm::alloc kind, std::option
             throw sycl::exception(sycl::errc::memory_allocation, "Allocation size exceeds device limit");
         }
 
-        bytes_free = &system.device_bytes_free.at(*device);
+        bytes_free = detail::device_bytes_free(*device);
         if(*bytes_free < size_bytes) {
             throw sycl::exception(sycl::errc::memory_allocation, "Not enough memory available");
         }
@@ -225,7 +224,7 @@ void usm_free(void *ptr, const sycl::context &context) {
 #endif
 
     if(iter->get_device().has_value()) {
-        system.device_bytes_free.at(iter->get_device().value()) += iter->get_size_bytes();
+        *detail::device_bytes_free(iter->get_device().value()) += iter->get_size_bytes();
     }
     system.usm_allocations.erase(iter);
 }

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -19,6 +19,7 @@ add_executable(tests
   marray_tests.cc
   math_tests.cc
   reduction_tests.cc
+  schedule_tests.cc
   vec_tests.cc
 )
 

diff --git a/test/schedule_tests.cc b/test/schedule_tests.cc
@@ -0,0 +1,38 @@
+#include <sycl/sycl.hpp>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators.hpp>
+
+
+bool is_fibonacci(const int *buf, int n) {
+    for(int i = 0; i < n; ++i) {
+        if(buf[i] != (i < 2 ? i : buf[i - 1] + buf[i - 2])) return false;
+    }
+    return true;
+}
+
+TEST_CASE(
+    "naive racy fibonacci implementation is produces expected results only with round-robin schedule", "[schedule]") {
+    const bool chaos_mode = GENERATE(values<int>({false, true}));
+    if(chaos_mode) { simsycl::set_cooperative_schedule(std::make_unique<simsycl::shuffle_schedule>()); }
+    INFO((chaos_mode ? "shuffle schedule" : "round-robin schedule"));
+
+    sycl::queue q;
+    auto *buf = sycl::malloc_shared<int>(100, q);
+
+    SECTION("in simple parallel_for") {
+        q.parallel_for(sycl::range<1>(100), [=](sycl::item<1> item) {
+            const int i = item.get_id(0);
+            buf[i] = i < 2 ? i : buf[i - 1] + buf[i - 2];
+        });
+        CHECK(is_fibonacci(buf, 100) == !chaos_mode);
+    }
+
+    SECTION("in nd_range parallel_for") {
+        q.parallel_for(sycl::nd_range<1>(100, 100), [=](sycl::nd_item<1> item) {
+            const int i = item.get_global_id(0);
+            buf[i] = i < 2 ? i : buf[i - 1] + buf[i - 2];
+        });
+        CHECK(is_fibonacci(buf, 100) == !chaos_mode);
+    }
+}