diff --git a/include/handler.h b/include/handler.h index 4b6bb5820..dd615dae6 100644 --- a/include/handler.h +++ b/include/handler.h @@ -28,14 +28,35 @@ #endif namespace celerity { - class handler; - -namespace experimental { - template - void constrain_split(handler& cgh, const range& constraint); } +namespace celerity::experimental { + +/** + * Constrains the granularity at which a task's global range can be split into chunks. + * + * In some situations an output buffer access is only guaranteed to write to non-overlapping subranges + * if the task is split in a certain way. For example when computing the row-wise sum of a 2D matrix into + * a 1D vector, a split constraint is required to ensure that each element of the vector is written by + * exactly one chunk. + * + * Another use case is for performance optimization, for example when the creation of lots of small chunks + * would result in hardware under-utilization and excessive data transfers. + * + * Since ND-range parallel_for kernels are already constrained to be split with group size granularity, + * adding an additional constraint on top results in an effective constraint of LCM(group size, constraint). + * + * The constraint (or effective constraint) must evenly divide the global range. + * This function has no effect when called for a task without a user-provided global range. + */ +template +void constrain_split(handler& cgh, const range& constraint); + +} // namespace celerity::experimental + +namespace celerity { + namespace detail { class device_queue; class task_manager; @@ -359,7 +380,7 @@ class handler { template void host_task(range global_range, id global_offset, Functor&& kernel) { const detail::task_geometry geometry{ - Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(range(detail::unit_range))}; + Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(global_range, range(detail::ones))}; auto launcher = make_host_task_launcher(detail::range_cast<3>(global_range), 0, std::forward(kernel)); create_host_compute_task(geometry, std::move(launcher)); } @@ -391,7 +412,7 @@ class handler { size_t m_num_collective_nodes; detail::hydration_id m_next_accessor_hydration_id = 1; std::vector> m_attached_state; - range<3> m_split_constraint = detail::unit_range; + range<3> m_split_constraint = detail::ones; handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {} @@ -418,8 +439,8 @@ class handler { granularity[d] = local_range[d]; } } - const detail::task_geometry geometry{ - Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(detail::range_cast(granularity))}; + const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), + get_constrained_granularity(global_range, detail::range_cast(granularity))}; auto launcher = make_device_kernel_launcher( global_range, global_offset, local_range, std::forward(kernel), std::index_sequence_for(), reductions...); create_device_compute_task(geometry, detail::kernel_debug_name(), std::move(launcher)); @@ -451,13 +472,17 @@ class handler { } template - range<3> get_constrained_granularity(const range& granularity) const { + range<3> get_constrained_granularity(const range& global_size, const range& granularity) const { range<3> result = detail::range_cast<3>(granularity); - for(size_t i = 0; i < Dims; ++i) { + for(int i = 0; i < Dims; ++i) { const auto lcm = std::lcm(granularity[i], m_split_constraint[i]); if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); } result[i] = lcm; } + if(global_size % detail::range_cast(result) != range(detail::zeros)) { + throw std::runtime_error( + fmt::format("The split constraint {} does not evenly divide the global size {}", detail::range_cast(result), global_size)); + } return result; } @@ -649,11 +674,11 @@ template return detail::reduction_impl(vars, cgh, identity, combiner, prop_list); } -namespace experimental { - template - void constrain_split(handler& cgh, const range& constraint) { - cgh.experimental_constrain_split(constraint); - } -} // namespace experimental - } // namespace celerity + +namespace celerity::experimental { +template +void constrain_split(handler& cgh, const range& constraint) { + cgh.experimental_constrain_split(constraint); +} +} // namespace celerity::experimental diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index 97ccd36f0..e31921802 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -795,6 +795,41 @@ namespace detail { #endif + TEST_CASE_METHOD(test_utils::runtime_fixture, "handler throws if effective split constraint does not evenly divide global size", "[handler]") { + distr_queue q; + + const auto submit = [&q](auto range, auto constraint) { + q.submit([&](handler& cgh) { + experimental::constrain_split(cgh, constraint); + cgh.parallel_for(range, [=](auto) {}); + }); + }; + + CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{0}), "Split constraint cannot be 0"); + CHECK_THROWS_WITH(submit(range<2>{10, 10}, range<2>{2, 0}), "Split constraint cannot be 0"); + CHECK_THROWS_WITH(submit(range<3>{10, 10, 10}, range<3>{2, 2, 0}), "Split constraint cannot be 0"); + + CHECK_NOTHROW(submit(range<1>{10}, range<1>{2})); + CHECK_NOTHROW(submit(range<2>{10, 8}, range<2>{2, 4})); + CHECK_NOTHROW(submit(range<3>{10, 8, 16}, range<3>{2, 4, 8})); + + CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{3}), "The split constraint [3] does not evenly divide the global size [10]"); + CHECK_THROWS_WITH(submit(range<2>{10, 8}, range<2>{2, 5}), "The split constraint [2,5] does not evenly divide the global size [10,8]"); + CHECK_THROWS_WITH(submit(range<3>{10, 8, 16}, range<3>{2, 4, 9}), "The split constraint [2,4,9] does not evenly divide the global size [10,8,16]"); + + CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{20}), "The split constraint [20] does not evenly divide the global size [10]"); + + CHECK_NOTHROW(submit(nd_range<1>{100, 10}, range<1>{2})); + CHECK_NOTHROW(submit(nd_range<2>{{100, 80}, {10, 20}}, range<2>{2, 4})); + CHECK_NOTHROW(submit(nd_range<3>{{100, 80, 60}, {1, 2, 30}}, range<3>{2, 4, 20})); + + CHECK_THROWS_WITH(submit(nd_range<1>{100, 10}, range<1>{3}), "The split constraint [30] does not evenly divide the global size [100]"); + CHECK_THROWS_WITH( + submit(nd_range<2>{{100, 80}, {10, 20}}, range<2>{2, 3}), "The split constraint [10,60] does not evenly divide the global size [100,80]"); + CHECK_THROWS_WITH(submit(nd_range<3>{{100, 80, 60}, {1, 2, 30}}, range<3>{1, 2, 40}), + "The split constraint [1,2,120] does not evenly divide the global size [100,80,60]"); + } + TEST_CASE_METHOD(test_utils::runtime_fixture, "handler throws when accessor target does not match command type", "[handler]") { distr_queue q; buffer buf0{1};