Skip to content

Commit

Permalink
Add experimental split constraints API
Browse files Browse the repository at this point in the history
This adds a new experimental::constrain_split() API that can be used to
limit the ways in which a kernel can be split, similar to how ND-range
kernels are constrained to be split along work group boundaries.
  • Loading branch information
psalz committed Sep 15, 2023
1 parent 3d7c59c commit df5c47b
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Versioning](http://semver.org/spec/v2.0.0.html).

- Introduce new experimental `for_each_item` utility to iterate over a celerity range (#199)
- Add new environment variables `CELERITY_HORIZON_STEP` and `CELERITY_HORIZON_MAX_PARALLELISM` to control Horizon generation (#199)
- Add new `experimental::constrain_split` API to limit how a kernel can be split (#?)

## Changed

Expand Down
38 changes: 36 additions & 2 deletions include/handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ namespace celerity {

class handler;

namespace experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint);
}

namespace detail {
class device_queue;
class task_manager;
Expand Down Expand Up @@ -353,7 +358,8 @@ class handler {
*/
template <int Dims, typename Functor>
void host_task(range<Dims> global_range, id<Dims> global_offset, Functor&& kernel) {
const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), {1, 1, 1}};
const detail::task_geometry geometry{
Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(range<Dims>(detail::unit_range))};
auto launcher = make_host_task_launcher<Dims, false>(detail::range_cast<3>(global_range), 0, std::forward<Functor>(kernel));
create_host_compute_task(geometry, std::move(launcher));
}
Expand All @@ -372,6 +378,8 @@ class handler {
friend detail::hydration_id detail::add_requirement(handler& cgh, const detail::buffer_id bid, std::unique_ptr<detail::range_mapper_base> rm);
friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order, const bool is_void);
friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo);
template <int Dims>
friend void experimental::constrain_split(handler& cgh, const range<Dims>& constraint);
friend void detail::extend_lifetime(handler& cgh, std::shared_ptr<detail::lifetime_extending_state> state);

detail::task_id m_tid;
Expand All @@ -383,6 +391,7 @@ class handler {
size_t m_num_collective_nodes;
detail::hydration_id m_next_accessor_hydration_id = 1;
std::vector<std::shared_ptr<detail::lifetime_extending_state>> m_attached_state;
range<3> m_split_constraint = detail::unit_range;

handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {}

Expand All @@ -409,7 +418,8 @@ class handler {
granularity[d] = local_range[d];
}
}
const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), granularity};
const detail::task_geometry geometry{
Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), get_constrained_granularity(detail::range_cast<Dims>(granularity))};
auto launcher = make_device_kernel_launcher<KernelFlavor, KernelName, Dims>(
global_range, global_offset, local_range, std::forward<Kernel>(kernel), std::index_sequence_for<Reductions...>(), reductions...);
create_device_compute_task(geometry, detail::kernel_debug_name<KernelName>(), std::move(launcher));
Expand All @@ -434,6 +444,23 @@ class handler {

void extend_lifetime(std::shared_ptr<detail::lifetime_extending_state> state) { m_attached_state.emplace_back(std::move(state)); }

template <int Dims>
void experimental_constrain_split(const range<Dims>& constraint) {
assert(m_task == nullptr);
m_split_constraint = detail::range_cast<3>(constraint);
}

template <int Dims>
range<3> get_constrained_granularity(const range<Dims>& granularity) const {
range<3> result = detail::range_cast<3>(granularity);
for(size_t i = 0; i < Dims; ++i) {
const auto lcm = std::lcm(granularity[i], m_split_constraint[i]);
if(lcm == 0) { throw std::runtime_error("Split constraint cannot be 0"); }
result[i] = lcm;
}
return result;
}

void create_host_compute_task(detail::task_geometry geometry, std::unique_ptr<detail::command_launcher_storage_base> launcher) {
assert(m_task == nullptr);
if(geometry.global_size.size() == 0) {
Expand Down Expand Up @@ -622,4 +649,11 @@ template <typename DataT, int Dims, typename BinaryOperation>
return detail::reduction_impl(vars, cgh, identity, combiner, prop_list);
}

namespace experimental {
template <int Dims>
void constrain_split(handler& cgh, const range<Dims>& constraint) {
cgh.experimental_constrain_split(constraint);
}
} // namespace experimental

} // namespace celerity
5 changes: 5 additions & 0 deletions test/distributed_graph_generator_test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ class task_builder {
return chain<step>([&host_obj, order](handler& cgh) { host_obj.add_side_effect(cgh, order); });
}

template <int Dims>
step constrain_split(const range<Dims>& constraint) {
return chain<step>([constraint](handler& cgh) { experimental::constrain_split(cgh, constraint); });
}

private:
dist_cdag_test_context& m_dctx;
std::deque<action> m_actions;
Expand Down
17 changes: 17 additions & 0 deletions test/graph_gen_granularity_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ TEST_CASE("distributed_graph_generator respects task granularity when splitting"
}
}

TEST_CASE("distributed_graph_generator respects split constraints", "[distributed_graph_generator]") {
const size_t num_nodes = 4;
dist_cdag_test_context dctx(num_nodes);

// Split constraints use the same underlying mechanisms as task granularity (tested above), so we'll keep this brief
const auto tid_a = dctx.device_compute<class UKN(task)>(range<1>{128}).constrain_split(range<1>{64}).submit();
REQUIRE(dctx.query(tid_a).count() == 2);
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_a).get_raw(0)[0])->get_execution_range().range == range<3>{64, 1, 1});
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_a).get_raw(1)[0])->get_execution_range().range == range<3>{64, 1, 1});

// The more interesting aspect is that a constrained nd-range kernel uses the least common multiple of the two constraints
const auto tid_b = dctx.device_compute<class UKN(task)>(nd_range<1>{{192}, {32}}).constrain_split(range<1>{3}).submit();
REQUIRE(dctx.query(tid_b).count() == 2);
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_b).get_raw(0)[0])->get_execution_range().range == range<3>{96, 1, 1});
CHECK(dynamic_cast<const execution_command*>(dctx.query(tid_b).get_raw(1)[0])->get_execution_range().range == range<3>{96, 1, 1});
}

template <int Dims>
class simple_task;

Expand Down

0 comments on commit df5c47b

Please sign in to comment.