Skip to content

Commit

Permalink
Address reviewer comments
Browse files Browse the repository at this point in the history
  • Loading branch information
psalz committed Sep 15, 2023
1 parent df5c47b commit 29433eb
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 18 deletions.
61 changes: 43 additions & 18 deletions include/handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,35 @@
#endif

namespace celerity {

class handler;

namespace experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);
}

namespace celerity::experimental {

/**
* Constrains the granularity at which a task's global range can be split into chunks.
*
* In some situations an output buffer access is only guaranteed to write to non-overlapping subranges
* if the task is split in a certain way. For example when computing the row-wise sum of a 2D matrix into
* a 1D vector, a split constraint is required to ensure that each element of the vector is written by
* exactly one chunk.
*
* Another use case is for performance optimization, for example when the creation of lots of small chunks
* would result in hardware under-utilization and excessive data transfers.
*
* Since ND-range parallel_for kernels are already constrained to be split with group size granularity,
* adding an additional constraint on top results in an effective constraint of LCM(group size, constraint).
*
* The constraint (or effective constraint) must evenly divide the global range.
* This function has no effect when called for a task without a user-provided global range.
*/
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);

} // namespace celerity::experimental

namespace celerity {

namespace detail {
class device_queue;
class task_manager;
Expand Down Expand Up @@ -359,7 +380,7 @@ class handler {
template <int Dims, typename Functor>
void host_task(range<Dims> global_range, id<Dims> global_offset, Functor&& kernel) {
const detail::task_geometry geometry{
Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(range<Dims>(detail::unit_range))};
Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(global_range, range<Dims>(detail::ones))};
auto launcher = make_host_task_launcher<Dims, false>(detail::range_cast<3>(global_range), 0, std::forward<Functor>(kernel));
create_host_compute_task(geometry, std::move(launcher));
}
Expand Down Expand Up @@ -391,7 +412,7 @@ class handler {
size_t m_num_collective_nodes;
detail::hydration_id m_next_accessor_hydration_id = 1;
std::vector<std::shared_ptr<detail::lifetime_extending_state>> m_attached_state;
range<3> m_split_constraint = detail::unit_range;
range<3> m_split_constraint = detail::ones;

handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {}

Expand All @@ -418,8 +439,8 @@ class handler {
granularity[d] = local_range[d];
}
}
const detail::task_geometry geometry{
Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(detail::range_cast<Dims>(granularity))};
const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset),
get_constrained_granularity(global_range, detail::range_cast<Dims>(granularity))};
auto launcher = make_device_kernel_launcher<KernelFlavor, KernelName, Dims>(
global_range, global_offset, local_range, std::forward<Kernel>(kernel), std::index_sequence_for<Reductions...>(), reductions...);
create_device_compute_task(geometry, detail::kernel_debug_name<KernelName>(), std::move(launcher));
Expand Down Expand Up @@ -451,13 +472,17 @@ class handler {
}

template <int Dims>
range<3> get_constrained_granularity(const range<Dims>& granularity) const {
range<3> get_constrained_granularity(const range<Dims>& global_size, const range<Dims>& granularity) const {
range<3> result = detail::range_cast<3>(granularity);
for(size_t i = 0; i < Dims; ++i) {
for(int i = 0; i < Dims; ++i) {
const auto lcm = std::lcm(granularity[i], m_split_constraint[i]);
if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); }
result[i] = lcm;
}
if(global_size % detail::range_cast<Dims>(result) != range<Dims>(detail::zeros)) {
throw std::runtime_error(
fmt::format("The split constraint {} does not evenly divide the global size {}", detail::range_cast<Dims>(result), global_size));
}
return result;
}

Expand Down Expand Up @@ -649,11 +674,11 @@ template <typename DataT, int Dims, typename BinaryOperation>
return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}

namespace experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
cgh.experimental_constrain_split(constraint);
}
} // namespace experimental

} // namespace celerity

namespace celerity::experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
cgh.experimental_constrain_split(constraint);
}
} // namespace celerity::experimental
35 changes: 35 additions & 0 deletions test/runtime_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,41 @@ namespace detail {

#endif

TEST_CASE_METHOD(test_utils::runtime_fixture, "handler throws if effective split constraint does not evenly divide global size", "[handler]") {
distr_queue q;

const auto submit = [&q](auto range, auto constraint) {
q.submit([&](handler& cgh) {
experimental::constrain_split(cgh, constraint);
cgh.parallel_for(range, [=](auto) {});
});
};

CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{0}), "Split constraint cannot be 0");
CHECK_THROWS_WITH(submit(range<2>{10, 10}, range<2>{2, 0}), "Split constraint cannot be 0");
CHECK_THROWS_WITH(submit(range<3>{10, 10, 10}, range<3>{2, 2, 0}), "Split constraint cannot be 0");

CHECK_NOTHROW(submit(range<1>{10}, range<1>{2}));
CHECK_NOTHROW(submit(range<2>{10, 8}, range<2>{2, 4}));
CHECK_NOTHROW(submit(range<3>{10, 8, 16}, range<3>{2, 4, 8}));

CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{3}), "The split constraint [3] does not evenly divide the global size [10]");
CHECK_THROWS_WITH(submit(range<2>{10, 8}, range<2>{2, 5}), "The split constraint [2,5] does not evenly divide the global size [10,8]");
CHECK_THROWS_WITH(submit(range<3>{10, 8, 16}, range<3>{2, 4, 9}), "The split constraint [2,4,9] does not evenly divide the global size [10,8,16]");

CHECK_THROWS_WITH(submit(range<1>{10}, range<1>{20}), "The split constraint [20] does not evenly divide the global size [10]");

CHECK_NOTHROW(submit(nd_range<1>{100, 10}, range<1>{2}));
CHECK_NOTHROW(submit(nd_range<2>{{100, 80}, {10, 20}}, range<2>{2, 4}));
CHECK_NOTHROW(submit(nd_range<3>{{100, 80, 60}, {1, 2, 30}}, range<3>{2, 4, 20}));

CHECK_THROWS_WITH(submit(nd_range<1>{100, 10}, range<1>{3}), "The split constraint [30] does not evenly divide the global size [100]");
CHECK_THROWS_WITH(
submit(nd_range<2>{{100, 80}, {10, 20}}, range<2>{2, 3}), "The split constraint [10,60] does not evenly divide the global size [100,80]");
CHECK_THROWS_WITH(submit(nd_range<3>{{100, 80, 60}, {1, 2, 30}}, range<3>{1, 2, 40}),
"The split constraint [1,2,120] does not evenly divide the global size [100,80,60]");
}

TEST_CASE_METHOD(test_utils::runtime_fixture, "handler throws when accessor target does not match command type", "[handler]") {
distr_queue q;
buffer<size_t, 1> buf0{1};
Expand Down

0 comments on commit 29433eb

Please sign in to comment.