diff --git a/include/distributed_graph_generator.h b/include/distributed_graph_generator.h
index f64180d7f..846eb8b1f 100644
--- a/include/distributed_graph_generator.h
+++ b/include/distributed_graph_generator.h
@@ -2,7 +2,6 @@
 #include
 #include
-#include

 #include "command_graph.h"
 #include "ranges.h"
@@ -65,6 +64,7 @@ class distributed_graph_generator {
 	buffer_state(region_map lw, region_map> rr)
 	    : local_last_writer(std::move(lw)), replicated_regions(std::move(rr)), pending_reduction(std::nullopt) {}

+	region<3> initialized_region; // for detecting uninitialized reads
 	region_map local_last_writer;
 	region_map replicated_regions;
@@ -78,7 +78,10 @@ class distributed_graph_generator {
 	distributed_graph_generator(
 	    const size_t num_nodes, const node_id local_nid, command_graph& cdag, const task_manager& tm, detail::command_recorder* recorder);

-	void add_buffer(const buffer_id bid, const int dims, const range<3>& range);
+	void set_uninitialized_read_policy(const error_policy policy) { m_uninitialized_read_policy = policy; }
+	void set_overlapping_write_policy(const error_policy policy) { m_overlapping_write_policy = policy; }
+
+	void add_buffer(const buffer_id bid, const int dims, const range<3>& range, bool host_initialized);

 	std::unordered_set build_task(const task& tsk);
@@ -124,6 +127,8 @@ class distributed_graph_generator {
 	node_id m_local_nid;
 	command_graph& m_cdag;
 	const task_manager& m_task_mngr;
+	error_policy m_uninitialized_read_policy = error_policy::throw_exception;
+	error_policy m_overlapping_write_policy = error_policy::throw_exception;
 	std::unordered_map m_buffer_states;
 	command_id m_epoch_for_new_commands = 0;
 	command_id m_epoch_last_pruned_before = 0;
diff --git a/include/scheduler.h b/include/scheduler.h
index 77723b3a4..60f73e0bb 100644
--- a/include/scheduler.h
+++ b/include/scheduler.h
@@ -32,7 +32,9 @@ namespace detail {
 	 */
 	void notify_task_created(const task* const tsk) { notify(event_task_available{tsk}); }

-	void notify_buffer_registered(const buffer_id bid, const int dims, const range<3>& range) { notify(event_buffer_registered{bid, dims, range}); }
+	void notify_buffer_registered(const buffer_id bid, const int dims, const range<3>& range, bool host_initialized) {
+		notify(event_buffer_registered{bid, dims, range, host_initialized});
+	}

 protected:
 	/**
@@ -53,6 +55,7 @@ namespace detail {
 		buffer_id bid;
 		int dims;
 		celerity::range<3> range;
+		bool host_initialized;
 	};

 	using event = std::variant;
diff --git a/include/task.h b/include/task.h
index e4d09e529..4069926e3 100644
--- a/include/task.h
+++ b/include/task.h
@@ -271,5 +271,7 @@ namespace detail {
 		}
 	};

+	std::unordered_map<buffer_id, region<3>> detect_overlapping_writes(const task& tsk, const std::vector<chunk<3>>& chunks);
+
 } // namespace detail
 } // namespace celerity
diff --git a/include/task_manager.h b/include/task_manager.h
index 6fed0240e..a94b93a5d 100644
--- a/include/task_manager.h
+++ b/include/task_manager.h
@@ -64,6 +64,8 @@ namespace detail {
 		virtual ~task_manager() = default;

+		void set_uninitialized_read_policy(const error_policy policy) { m_uninitialized_read_policy = policy; }
+
 		template <typename CGF, typename... Hints>
 		task_id submit_command_group(CGF cgf, Hints... hints) {
 			auto reservation = m_task_buffer.reserve_task_entry(await_free_task_slot_callback());
@@ -182,6 +184,8 @@ namespace detail {
 		const size_t m_num_collective_nodes;
 		host_queue* m_queue;

+		error_policy m_uninitialized_read_policy = error_policy::throw_exception;
+
 		task_ring_buffer m_task_buffer;
 		// The active epoch is used as the last writer for host-initialized buffers.
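Taken together, the setters above and the `error_policy`/`utils::report_error` machinery introduced in include/types.h and include/utils.h below form one small pattern: every checker consults a runtime-configurable policy before deciding whether a diagnostic is silently dropped, logged, or thrown. A minimal self-contained sketch of that pattern (using `fprintf` and `std::runtime_error` in place of Celerity's `CELERITY_WARN`/`CELERITY_ERROR` logging macros, which live outside this diff) looks like this:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

enum class error_policy { ignore, log_warning, log_error, throw_exception };

// Mirrors utils::report_error below: a single call site, with the reaction
// (ignore / warn / error / throw) chosen at runtime by the configured policy.
void report_error(const error_policy policy, const std::string& msg) {
	switch(policy) {
	case error_policy::ignore: break;
	case error_policy::log_warning: std::fprintf(stderr, "warning: %s\n", msg.c_str()); break;
	case error_policy::log_error: std::fprintf(stderr, "error: %s\n", msg.c_str()); break;
	case error_policy::throw_exception: throw std::runtime_error(msg);
	}
}

int main() {
	report_error(error_policy::log_warning, "T1 declares a reading access on uninitialized B0");
	try {
		report_error(error_policy::throw_exception, "T1 declares a reading access on uninitialized B0");
	} catch(const std::runtime_error& e) {
		std::fprintf(stderr, "caught: %s\n", e.what());
	}
	return 0;
}
```

Defaulting the policy members to `throw_exception` (as the headers above do) makes unit tests fail loudly, while the runtime later relaxes them to warnings/errors in src/runtime.cc.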
diff --git a/include/types.h b/include/types.h
index ddbd363df..44cc7f65e 100644
--- a/include/types.h
+++ b/include/types.h
@@ -75,4 +75,12 @@ struct reduction_info {
 };

 constexpr node_id master_node_id = 0;
+
+enum class error_policy {
+	ignore,
+	log_warning,
+	log_error,
+	throw_exception,
+};
+
 } // namespace celerity::detail
diff --git a/include/utils.h b/include/utils.h
index e46f980fe..09ef1403b 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -1,11 +1,17 @@
 #pragma once

+#include "types.h"
+
 #include
 #include
+#include
 #include
 #include
 #include

+#include "spdlog/fmt/fmt.h"
+
+
 namespace celerity::detail::utils {

 template
@@ -90,4 +96,19 @@
 std::string simplify_task_name(const std::string& demangled_type_name);

 // escapes "<", ">", and "&" with their corresponding HTML escape sequences
 std::string escape_for_dot_label(std::string str);

+template <typename... FmtParams>
+[[noreturn]] void throw_error(FmtParams&&... fmt_args) {
+	throw std::runtime_error(fmt::format(std::forward<FmtParams>(fmt_args)...));
+}
+
+template <typename... FmtParams>
+void report_error(const error_policy policy, FmtParams&&... fmt_args) {
+	switch(policy) {
+	case error_policy::ignore: break;
+	case error_policy::log_warning: CELERITY_WARN(std::forward<FmtParams>(fmt_args)...); break;
+	case error_policy::log_error: CELERITY_ERROR(std::forward<FmtParams>(fmt_args)...); break;
+	case error_policy::throw_exception: throw_error(std::forward<FmtParams>(fmt_args)...); break;
+	}
+}
+
 } // namespace celerity::detail::utils
diff --git a/src/distributed_graph_generator.cc b/src/distributed_graph_generator.cc
index 788ed6964..00084a582 100644
--- a/src/distributed_graph_generator.cc
+++ b/src/distributed_graph_generator.cc
@@ -25,9 +25,10 @@ distributed_graph_generator::distributed_graph_generator(
 	m_epoch_for_new_commands = epoch_cmd->get_cid();
 }

-void distributed_graph_generator::add_buffer(const buffer_id bid, const int dims, const range<3>& range) {
+void distributed_graph_generator::add_buffer(const buffer_id bid, const int dims, const range<3>& range, bool host_initialized) {
 	m_buffer_states.emplace(
 	    std::piecewise_construct, std::tuple{bid}, std::tuple{region_map{range, dims}, region_map{range, dims}});
+	if(host_initialized) { m_buffer_states.at(bid).initialized_region = box(subrange({}, range)); }
 	// Mark contents as available locally (= don't generate await push commands) and fully replicated (= don't generate push commands).
 	// This is required when tasks access host-initialized or uninitialized buffers.
 	m_buffer_states.at(bid).local_last_writer.update_region(subrange<3>({}, range), m_epoch_for_new_commands);
@@ -162,6 +163,19 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 	assert(chunks.size() <= num_chunks); // We may have created less than requested
 	assert(!chunks.empty());

+	if(m_overlapping_write_policy != error_policy::ignore) {
+		if(const auto overlapping_writes = detect_overlapping_writes(tsk, chunks); !overlapping_writes.empty()) {
+			auto error = fmt::format("Task T{}", tsk.get_id());
+			if(!tsk.get_debug_name().empty()) { fmt::format_to(std::back_inserter(error), " \"{}\"", tsk.get_debug_name()); }
+			error += " has overlapping writes between multiple nodes in";
+			for(const auto& [bid, overlap] : overlapping_writes) {
+				fmt::format_to(std::back_inserter(error), " B{} {}", bid, overlap);
+			}
+			error += ". Choose a non-overlapping range mapper for the write access or constrain the split to make the access non-overlapping.";
+			utils::report_error(m_overlapping_write_policy, "{}", error);
+		}
+	}
+
 	// Assign each chunk to a node
 	// We assign chunks next to each other to the same worker (if there are more chunks than workers), as this is likely to produce fewer
 	// transfers between tasks than a round-robin assignment (for typical stencil codes).
@@ -283,6 +297,7 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 			// TODO the per-node reduction result is discarded - warn user about dead store
 		}

+		region<3> uninitialized_reads;
 		for(const auto mode : required_modes) {
 			const auto& req = reqs_by_mode.at(mode);
 			if(detail::access::mode_traits::is_consumer(mode)) {
@@ -340,6 +355,10 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 					}
 				}
 			}
+
+			if(is_local_chunk && m_uninitialized_read_policy != error_policy::ignore) {
+				uninitialized_reads = region_union(uninitialized_reads, region_difference(req, buffer_state.initialized_region));
+			}
 		}

 		if(is_local_chunk && detail::access::mode_traits::is_producer(mode)) {
@@ -353,7 +372,14 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 			}
 		}

+		if(!uninitialized_reads.empty()) {
+			utils::report_error(m_uninitialized_read_policy, "Command C{} on N{} reads B{} {}, which has not been written by any node.", cmd->get_cid(),
+			    m_local_nid, bid, detail::region(std::move(uninitialized_reads)));
+		}
+
 		if(generate_reduction) {
+			post_reduction_buffer_states.at(bid).initialized_region = scalar_box;
+
 			const auto& reduction = *buffer_state.pending_reduction;

 			const auto local_last_writer = buffer_state.local_last_writer.get_region_values(scalar_box);
@@ -478,6 +504,11 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 	// Determine which local data is fresh/stale based on task-level writes.
 	auto requirements = get_buffer_requirements_for_mapped_access(tsk, subrange<3>(tsk.get_global_offset(), tsk.get_global_size()), tsk.get_global_size());
+	// Add requirements for reductions
+	for(const auto& reduction : tsk.get_reductions()) {
+		// the actual mode is irrelevant as long as it's a producer - TODO have a better query API for task buffer requirements
+		requirements[reduction.bid][access_mode::write] = scalar_box;
+	}
 	for(auto& [bid, reqs_by_mode] : requirements) {
 		box_vector<3> global_write_boxes;
 		for(const auto mode : access::producer_modes) {
@@ -491,6 +522,8 @@ void distributed_graph_generator::generate_distributed_commands(const task& tsk)
 		const auto remote_writes = region_difference(global_writes, local_writes);
 		auto& buffer_state = m_buffer_states.at(bid);

+		buffer_state.initialized_region = region_union(buffer_state.initialized_region, global_writes);
+
 		// TODO: We need a way of updating regions in place! E.g. apply_to_values(box, callback)
 		auto boxes_and_cids = buffer_state.local_last_writer.get_region_values(remote_writes);
 		for(auto& [box, wcs] : boxes_and_cids) {
diff --git a/src/runtime.cc b/src/runtime.cc
index a73ed3b6e..acf409a70 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -148,14 +148,25 @@ namespace detail {
 		m_reduction_mngr = std::make_unique();
 		m_host_object_mngr = std::make_unique();

+		if(m_cfg->is_recording()) m_task_recorder = std::make_unique(m_buffer_mngr.get());
 		m_task_mngr = std::make_unique(m_num_nodes, m_h_queue.get(), m_task_recorder.get());
 		if(m_cfg->get_horizon_step()) m_task_mngr->set_horizon_step(m_cfg->get_horizon_step().value());
 		if(m_cfg->get_horizon_max_parallelism()) m_task_mngr->set_horizon_max_parallelism(m_cfg->get_horizon_max_parallelism().value());
+		// Merely _declaring_ an uninitialized read is legitimate as long as the kernel does not actually perform the read at runtime - this might happen in the
+		// first iteration of a submit-loop. We could get rid of this case by making access-modes a runtime property of accessors (cf
+		// https://github.com/celerity/meta/issues/74).
+		m_task_mngr->set_uninitialized_read_policy(error_policy::log_warning);
+
 		m_exec = std::make_unique(m_num_nodes, m_local_nid, *m_h_queue, *m_d_queue, *m_task_mngr, *m_buffer_mngr, *m_reduction_mngr);
+
 		m_cdag = std::make_unique();
 		if(m_cfg->is_recording()) m_command_recorder = std::make_unique(m_task_mngr.get(), m_buffer_mngr.get());
 		auto dggen = std::make_unique(m_num_nodes, m_local_nid, *m_cdag, *m_task_mngr, m_command_recorder.get());
+		// Any uninitialized read that is observed on CDAG generation was already logged on task generation, unless we have a bug.
+		dggen->set_uninitialized_read_policy(error_policy::ignore);
+		dggen->set_overlapping_write_policy(error_policy::log_error);
+
 		m_schdlr = std::make_unique(is_dry_run(), std::move(dggen), *m_exec);
 		m_task_mngr->register_task_callback([this](const task* tsk) { m_schdlr->notify_task_created(tsk); });

@@ -274,7 +285,7 @@ namespace detail {
 	void runtime::handle_buffer_registered(buffer_id bid) {
 		const auto& info = m_buffer_mngr->get_buffer_info(bid);
 		m_task_mngr->add_buffer(bid, info.dimensions, info.range, info.is_host_initialized);
-		m_schdlr->notify_buffer_registered(bid, info.dimensions, info.range);
+		m_schdlr->notify_buffer_registered(bid, info.dimensions, info.range, info.is_host_initialized);
 	}

 	void runtime::handle_buffer_unregistered(buffer_id bid) { maybe_destroy_runtime(); }
diff --git a/src/scheduler.cc b/src/scheduler.cc
index 19980d20e..1122f736a 100644
--- a/src/scheduler.cc
+++ b/src/scheduler.cc
@@ -53,7 +53,7 @@ namespace detail {
 					   serializer.flush(cmds);
 				   },
 				   [&](const event_buffer_registered& e) { //
-					   m_dggen->add_buffer(e.bid, e.dims, e.range);
+					   m_dggen->add_buffer(e.bid, e.dims, e.range, e.host_initialized);
 				   },
 				   [&](const event_shutdown&) {
 					   assert(in_flight_events.empty());
diff --git a/src/task.cc b/src/task.cc
index f5fbaeb98..209d5ca7b 100644
--- a/src/task.cc
+++ b/src/task.cc
@@ -1,6 +1,6 @@
 #include "task.h"
+#include "access_modes.h"
-#include

 namespace celerity {
 namespace detail {
@@ -62,5 +62,45 @@ namespace detail {
 		// TODO for multiple side effects on the same hoid, find the weakest order satisfying all of them
 		emplace(hoid, order);
 	}

+	std::unordered_map<buffer_id, region<3>> detect_overlapping_writes(const task& tsk, const std::vector<chunk<3>>& chunks) {
+		const box<3> scalar_reduction_box({0, 0, 0}, {1, 1, 1});
+
+		auto& bam = tsk.get_buffer_access_map();
+
+		std::unordered_map<buffer_id, region<3>> buffer_write_accumulators;
+		std::unordered_map<buffer_id, region<3>> overlapping_writes;
+
+		for(const auto bid : bam.get_accessed_buffers()) {
+			for(const auto& ck : chunks) {
+				region<3> writes;
+				for(const auto mode : bam.get_access_modes(bid)) {
+					if(access::mode_traits::is_producer(mode)) {
+						const auto req = bam.get_mode_requirements(bid, mode, tsk.get_dimensions(), subrange(ck.offset, ck.range), tsk.get_global_size());
+						writes = region_union(writes, req);
+					}
+				}
+				if(!writes.empty()) {
+					auto& write_accumulator = buffer_write_accumulators[bid]; // allow default-insert
+					if(const auto overlap = region_intersection(write_accumulator, writes); !overlap.empty()) {
+						auto& full_overlap = overlapping_writes[bid]; // allow default-insert
+						full_overlap = region_union(full_overlap, overlap);
+					}
+					write_accumulator = region_union(write_accumulator, writes);
+				}
+			}
+		}
+
+		for(const auto& rinfo : tsk.get_reductions()) {
+			auto& write_accumulator = buffer_write_accumulators[rinfo.bid]; // allow default-insert
+			if(const auto overlap = region_intersection(write_accumulator, scalar_reduction_box); !overlap.empty()) {
+				auto& full_overlap = overlapping_writes[rinfo.bid]; // allow default-insert
+				full_overlap = region_union(full_overlap, overlap);
+			}
+			write_accumulator = region_union(write_accumulator, scalar_reduction_box);
+		}
+
+		return overlapping_writes;
+	}
+
 } // namespace detail
 } // namespace celerity
diff --git a/src/task_manager.cc b/src/task_manager.cc
index 77450ec7f..e74004167 100644
--- a/src/task_manager.cc
+++ b/src/task_manager.cc
@@ -98,12 +98,20 @@ namespace detail {
 			if(reduction.has_value()) { read_requirements = region_union(read_requirements, scalar_box); }
 			const auto last_writers = m_buffers_last_writers.at(bid).get_region_values(read_requirements);

-			for(auto& p : last_writers) {
-				// This indicates that the buffer is being used for the first time by this task, or all previous tasks also only read from it.
-				// A valid use case (i.e., not reading garbage) for this is when the buffer has been initialized using a host pointer.
-				if(p.second == std::nullopt) continue;
-				const task_id last_writer = *p.second;
-				add_dependency(tsk, *m_task_buffer.get_task(last_writer), dependency_kind::true_dep, dependency_origin::dataflow);
+			box_vector<3> uninitialized_reads;
+			for(const auto& [box, writer] : last_writers) {
+				// host-initialized buffers are last-written by the current epoch
+				if(writer.has_value()) {
+					add_dependency(tsk, *m_task_buffer.get_task(*writer), dependency_kind::true_dep, dependency_origin::dataflow);
+				} else {
+					uninitialized_reads.push_back(box);
+				}
+			}
+			if(!uninitialized_reads.empty()) {
+				utils::report_error(m_uninitialized_read_policy,
+				    "Task T{}{} declares a reading access on uninitialized B{} {}. Make sure to construct the accessor with no_init if possible.",
+				    tsk.get_id(), !tsk.get_debug_name().empty() ? fmt::format(" \"{}\"", utils::simplify_task_name(tsk.get_debug_name())) : "", bid,
+				    region(std::move(uninitialized_reads)));
 			}
 		}
diff --git a/test/distributed_graph_generator_test_utils.h b/test/distributed_graph_generator_test_utils.h
index 1cae35569..9b78dc5fc 100644
--- a/test/distributed_graph_generator_test_utils.h
+++ b/test/distributed_graph_generator_test_utils.h
@@ -494,7 +494,7 @@ class dist_cdag_test_context {
 		const auto buf = test_utils::mock_buffer(bid, size);
 		m_tm.add_buffer(bid, Dims, range_cast<3>(size), mark_as_host_initialized);
 		for(auto& dggen : m_dggens) {
-			dggen->add_buffer(bid, Dims, range_cast<3>(size));
+			dggen->add_buffer(bid, Dims, range_cast<3>(size), mark_as_host_initialized);
 		}
 		return buf;
 	}
diff --git a/test/graph_generation_tests.cc b/test/graph_generation_tests.cc
index 65e58b31b..7c5882324 100644
--- a/test/graph_generation_tests.cc
+++ b/test/graph_generation_tests.cc
@@ -1,5 +1,6 @@
 #include
 #include
+#include

 #include "distributed_graph_generator_test_utils.h"

@@ -123,27 +124,6 @@ TEST_CASE("distributed_graph_generator builds dependencies to all local commands
 	CHECK(dctx.query(tid_c).have_successors(dctx.query(tid_d)));
 }

-// This test case currently fails and exists for documentation purposes:
-// - Having fixed write access to a buffer results in unclear semantics when it comes to splitting the task into chunks.
-// - We could check for write access when using the built-in fixed range mapper and warn / throw.
-// - But of course this is the easy case; the user could just as well write the same by hand.
-//
-// Really the most sensible thing to do might be to check whether chunks write to overlapping regions and abort if so.
-TEST_CASE("distributed_graph_generator handles fixed write access", "[distributed_graph_generator][command-graph][!shouldfail]") {
-	dist_cdag_test_context dctx(2);
-
-	const range<1> test_range = {128};
-	auto buf0 = dctx.create_buffer(test_range);
-
-	const auto tid_a = dctx.device_compute(test_range).discard_write(buf0, acc::all{}).submit();
-	// Another solution could be to not split the task at all
-	CHECK(dctx.query(tid_a).count() == 1);
-
-	dctx.device_compute(test_range).read(buf0, acc::all{}).submit();
-	// Right now this generates push commands, which also doesn't make much sense
-	CHECK(dctx.query(command_type::push).empty());
-}
-
 // This is a highly constructed and unrealistic example, but we'd still like the behavior to be clearly defined.
TEST_CASE("distributed_graph_generator generates anti-dependencies for execution commands that have a task-level true dependency", "[distributed_graph_generator][command-graph]") { @@ -499,3 +479,48 @@ TEST_CASE("fences introduce dependencies on buffers", "[distributed_graph_genera CHECK(dctx.query(tid_fence, nid).have_successors(dctx.query(tid_b, nid))); } } + +TEST_CASE("distributed_graph_generator throws in tests if it detects an uninitialized read", "[distributed_graph_generator]") { + const size_t num_nodes = 2; + const range<1> node_range{num_nodes}; + + dist_cdag_test_context dctx(num_nodes); + dctx.get_task_manager().set_uninitialized_read_policy(error_policy::ignore); // otherwise we get task-level errors first + + SECTION("on a fully uninitialized buffer") { + auto buf = dctx.create_buffer<1>({1}); + CHECK_THROWS_WITH((dctx.device_compute(node_range).read(buf, acc::all()).submit()), + "Command C1 on N0 reads B0 {[0,0,0] - [1,1,1]}, which has not been written by any node."); + } + + SECTION("on a partially, locally initialized buffer") { + auto buf = dctx.create_buffer<1>(node_range); + dctx.device_compute(range(1)).discard_write(buf, acc::one_to_one()).submit(); + CHECK_THROWS_WITH((dctx.device_compute(node_range).read(buf, acc::all()).submit()), + "Command C2 on N0 reads B0 {[1,0,0] - [2,1,1]}, which has not been written by any node."); + } + + SECTION("on a partially, remotely initialized buffer") { + auto buf = dctx.create_buffer<1>(node_range); + dctx.device_compute(range(1)).discard_write(buf, acc::one_to_one()).submit(); + CHECK_THROWS_WITH((dctx.device_compute(node_range).read(buf, acc::one_to_one()).submit()), + "Command C1 on N1 reads B0 {[1,0,0] - [2,1,1]}, which has not been written by any node."); + } +} + +TEST_CASE("distributed_graph_generator throws in tests if it detects overlapping writes", "[distributed_graph_generator]") { + dist_cdag_test_context dctx(2); + auto buf = dctx.create_buffer<2>({20, 20}); + + SECTION("on all-write") { + CHECK_THROWS_WITH((dctx.device_compute(buf.get_range()).discard_write(buf, acc::all()).submit()), + "Task T1 \"celerity::detail::unnamed_kernel\" has overlapping writes between multiple nodes in B0 {[0,0,0] - [20,20,1]}. Choose a non-overlapping " + "range mapper for the write access or constrain the split to make the access non-overlapping."); + } + + SECTION("on neighborhood-write") { + CHECK_THROWS_WITH((dctx.device_compute(buf.get_range()).discard_write(buf, acc::neighborhood(1, 1)).submit()), + "Task T1 \"celerity::detail::unnamed_kernel\" has overlapping writes between multiple nodes in B0 {[9,0,0] - [11,20,1]}. 
Choose a non-overlapping " + "range mapper for the write access or constrain the split to make the access non-overlapping."); + } +} diff --git a/test/print_graph_tests.cc b/test/print_graph_tests.cc index 0b45e41dc..52bdd7253 100644 --- a/test/print_graph_tests.cc +++ b/test/print_graph_tests.cc @@ -70,8 +70,14 @@ TEST_CASE("command graph printing is unchanged", "[print_graph][command-graph]") auto buf_0 = dctx.create_buffer(range<1>{1}); - dctx.device_compute(range<1>(num_nodes)).reduce(buf_0, false).submit(); - dctx.device_compute(range<1>(num_nodes)).read(buf_0, acc::all{}).read_write(buf_0, acc::all{}).write(buf_0, acc::all{}).submit(); + dctx.device_compute(range<1>(num_nodes)) // + .reduce(buf_0, false) + .submit(); + dctx.device_compute(range<1>(num_nodes)) + .read(buf_0, acc::all{}) + .read_write(buf_0, acc::one_to_one{}) + .write(buf_0, acc::one_to_one{}) + .submit(); // Smoke test: It is valid for the dot output to change with updates to graph generation. If this test fails, verify that the printed graph is sane and // replace the `expected` value with the new dot graph. @@ -84,11 +90,11 @@ TEST_CASE("command graph printing is unchanged", "[print_graph][command-graph]") "[1,1,1]}
read_write B0 {[0,0,0] - [1,1,1]}
write B0 {[0,0,0] - [1,1,1]}> fontcolor=black " "shape=box];}id_0_0->id_0_1[color=orchid];id_0_3->id_0_2[];id_0_5->id_0_2[color=limegreen];id_0_6->id_0_2[color=limegreen];id_0_7->id_0_2[color=" "limegreen];id_0_3[label=reduction R1
B0 {[0,0,0] - [1,1,1]}> fontcolor=black " - "shape=ellipse];id_0_1->id_0_3[];id_0_4->id_0_3[];id_0_4[label=(R1) await push transfer 8589934592
BB0 {[0,0,0] - " - "[1,1,1]}> fontcolor=black shape=ellipse];id_0_0->id_0_4[color=orchid];id_0_5[label=(R1) push transfer 8589934593 to N1
BB0 " - "[0,0,0] + [1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_5[];id_0_6[label=(R1) push transfer 8589934594 to N2
BB0 " - "[0,0,0] + [1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_6[];id_0_7[label=(R1) push transfer 8589934595 to N3
BB0 " - "[0,0,0] + [1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_7[];}"; + "shape=ellipse];id_0_1->id_0_3[];id_0_4->id_0_3[];id_0_4[label=(R1) await push transfer 8589934592
BB0 {[0,0,0] - [1,1,1]}> " + "fontcolor=black shape=ellipse];id_0_0->id_0_4[color=orchid];id_0_5[label=(R1) push transfer 8589934593 to N1
BB0 [0,0,0] + " + "[1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_5[];id_0_6[label=(R1) push transfer 8589934594 to N2
BB0 [0,0,0] + " + "[1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_6[];id_0_7[label=(R1) push transfer 8589934595 to N3
BB0 [0,0,0] + " + "[1,1,1]> fontcolor=black shape=ellipse];id_0_1->id_0_7[];}"; // fully check node 0 const auto dot0 = dctx.print_command_graph(0); @@ -96,10 +102,11 @@ TEST_CASE("command graph printing is unchanged", "[print_graph][command-graph]") // only check the rough string length and occurence count of N1/N2... for other nodes const int expected_occurences = count_occurences(expected, "N0"); - for(size_t i = 1; i < num_nodes; ++i) { - const auto dot_n = dctx.print_command_graph(i); - REQUIRE_THAT(dot_n.size(), Catch::Matchers::WithinAbs(expected.size(), 50)); - CHECK(count_occurences(dot_n, fmt::format("N{}", i)) == expected_occurences); + for(node_id nid = 1; nid < num_nodes; ++nid) { + CAPTURE(nid); + const auto dot_n = dctx.print_command_graph(nid); + CHECK_THAT(dot_n.size(), Catch::Matchers::WithinAbs(expected.size(), 200)); + CHECK(count_occurences(dot_n, fmt::format("N{}", nid)) == expected_occurences); } } @@ -114,7 +121,7 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "buffer debug names show up in the CHECK(celerity::debug::get_buffer_name(buff_a) == buff_name); q.submit([&](handler& cgh) { - celerity::accessor acc_a{buff_a, cgh, acc::all{}, celerity::write_only}; + celerity::accessor acc_a{buff_a, cgh, acc::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range, [=](item<1> item) { (void)acc_a; }); }); @@ -138,7 +145,8 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "full graph is printed if CELERITY distr_queue q; celerity::range<1> range(16); - celerity::buffer buff_a(range); + std::vector init(range.size()); + celerity::buffer buff_a(init.data(), range); // set small horizon step size so that we do not need to generate a very large graph to test this functionality auto& tm = celerity::detail::runtime::get_instance().get_task_manager(); @@ -158,11 +166,11 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "full graph is printed if CELERITY SECTION("task graph") { const auto* expected = - "digraph G {label=\"Task Graph\" 0[shape=ellipse label=epoch>];1[shape=box style=rounded label=device-compute [0,0,0] + [16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}>];0->1[color=orchid];2[shape=ellipse " - "label=horizon>];1->2[color=orange];3[shape=box style=rounded label=device-compute " + "digraph G {label=\"Task Graph\" 0[shape=ellipse label=epoch>];1[shape=box style=rounded label=device-compute [0,0,0] + [16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}>];0->1[];2[shape=ellipse " + "label=horizon>];1->2[color=orange];3[shape=box style=rounded label=device-compute " "[0,0,0] + [16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}>];1->3[];4[shape=ellipse " - "label=horizon>];3->4[color=orange];2->4[color=orange];5[shape=box style=rounded label=horizon>];3->4[color=orange];2->4[color=orange];5[shape=box style=rounded label=device-compute [0,0,0] + [16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}>];3->5[];6[shape=ellipse " "label=horizon>];5->6[color=orange];4->6[color=orange];7[shape=ellipse label=epoch>];6->7[color=orange];}"; @@ -172,14 +180,14 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "full graph is printed if CELERITY SECTION("command graph") { const auto* expected = "digraph G{label=\"Command Graph\" subgraph cluster_id_0_0{label=<T0 (epoch)>;color=darkgray;id_0_0[label=epoch> fontcolor=black shape=box];}subgraph cluster_id_0_1{label=<T1 \"full_graph_printing_17\" " + "N0
epoch> fontcolor=black shape=box];}subgraph cluster_id_0_1{label=<T1 \"full_graph_printing_18\" " "(device-compute)>;color=darkgray;id_0_1[label=execution [0,0,0] + [16,1,1]
read_write B0 {[0,0,0] - " "[16,1,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_2{label=<T2 " "(horizon)>;color=darkgray;id_0_2[label=horizon> fontcolor=black shape=box];}subgraph cluster_id_0_3{label=<T3 \"full_graph_printing_17\" (device-compute)>;color=darkgray;id_0_3[label=execution [0,0,0] + " + "color=\"#606060\">T3 \"full_graph_printing_18\" (device-compute)
>;color=darkgray;id_0_3[label=execution [0,0,0] + " "[16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_4{label=<T4 " "(horizon)>;color=darkgray;id_0_4[label=horizon> fontcolor=black shape=box];}subgraph cluster_id_0_5{label=<T5 \"full_graph_printing_17\" (device-compute)>;color=darkgray;id_0_5[label=execution [0,0,0] + " + "color=\"#606060\">T5 \"full_graph_printing_18\" (device-compute)>;color=darkgray;id_0_5[label=execution [0,0,0] + " "[16,1,1]
read_write B0 {[0,0,0] - [16,1,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_6{label=<T6 " "(horizon)>;color=darkgray;id_0_6[label=horizon> fontcolor=black shape=box];}subgraph cluster_id_0_7{label=<T7 (epoch)>;color=darkgray;id_0_7[label=epoch (barrier)> fontcolor=black " diff --git a/test/runtime_deprecation_tests.cc b/test/runtime_deprecation_tests.cc index 0cecfc6cd..4037973ad 100644 --- a/test/runtime_deprecation_tests.cc +++ b/test/runtime_deprecation_tests.cc @@ -27,7 +27,7 @@ namespace detail { experimental::host_object ho; int my_int = 33; q.submit(allow_by_ref, [= /* capture buffer/host-object by value */, &my_int](handler& cgh) { - accessor acc{buf, cgh, celerity::access::all{}, celerity::write_only_host_task}; + accessor acc{buf, cgh, celerity::access::all{}, celerity::write_only_host_task, celerity::no_init}; experimental::side_effect se{ho, cgh}; cgh.host_task(on_master_node, [=, &my_int] { (void)acc; diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index 07cf515a4..8061ab421 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -1,7 +1,5 @@ #include "sycl_wrappers.h" -#include - #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #define NOMINMAX @@ -24,6 +22,7 @@ #include "named_threads.h" #include "ranges.h" +#include "log_test_utils.h" #include "test_utils.h" namespace celerity { @@ -89,10 +88,12 @@ namespace detail { } TEST_CASE_METHOD(test_utils::runtime_fixture, "get_access can be called on const buffer", "[buffer]") { - buffer buf_a{range<2>{32, 64}}; + const range<2> range{32, 64}; + std::vector init(range.size()); + buffer buf_a{init.data(), range}; auto& tm = runtime::get_instance().get_task_manager(); const auto tid = test_utils::add_compute_task( - tm, [&](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, buf_a.get_range()); + tm, [&](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, range); const auto tsk = tm.get_task(tid); const auto bufs = tsk->get_buffer_access_map().get_accessed_buffers(); REQUIRE(bufs.size() == 1); @@ -244,7 +245,7 @@ namespace detail { TEST_CASE("task_manager correctly records compute task information", "[task_manager][task][device_compute_task]") { task_manager tm{1, nullptr, nullptr}; test_utils::mock_buffer_factory mbf(tm); - auto buf_a = mbf.create_buffer(range<2>(64, 152)); + auto buf_a = mbf.create_buffer(range<2>(64, 152), true /* host_initialized */); auto buf_b = mbf.create_buffer(range<3>(7, 21, 99)); const auto tid = test_utils::add_compute_task( tm, @@ -557,19 +558,19 @@ namespace detail { buffer buf{{10, 10}}; CHECK_THROWS_WITH(q.submit([&](handler& cgh) { - auto acc = buf.get_access(cgh, one_to_one{}); + auto acc = buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<1>{10}, [=](celerity::item<1>) { (void)acc; }); }), "Invalid range mapper dimensionality: 1-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<1> nor " "(chunk<1>, range<2>) to produce subrange<2>"); CHECK_NOTHROW(q.submit([&](handler& cgh) { - auto acc = buf.get_access(cgh, one_to_one{}); + auto acc = buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<2>{10, 10}, [=](celerity::item<2>) { (void)acc; }); })); CHECK_THROWS_WITH(q.submit([&](handler& cgh) { - auto acc = buf.get_access(cgh, one_to_one{}); + auto acc = buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) { (void)acc; }); }), "Invalid range mapper dimensionality: 3-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<3> nor 
" @@ -594,7 +595,7 @@ namespace detail { TEMPLATE_TEST_CASE_METHOD_SIG( dimension_runtime_fixture, "item::get_id() includes global offset, item::get_linear_id() does not", "[item]", ((int Dims), Dims), 1, 2, 3) { - distr_queue q; + distr_queue q{sycl::device{sycl::default_selector_v}}; // Initialize runtime with a single device so we don't get multiple chunks const int n = 3; const auto global_offset = test_utils::truncate_id({4, 5, 6}); @@ -631,32 +632,38 @@ namespace detail { buffer buf_1{range<1>{2}}; CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<1>{1}, reduction(buf_1, cgh, cl::sycl::plus{}), [=](celerity::item<1>, auto&) {}); + cgh.parallel_for( + range<1>{1}, reduction(buf_1, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<1>, auto&) {}); })); buffer buf_4{range<1>{1}}; CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<1>{1}, reduction(buf_4, cgh, cl::sycl::plus{}), [=](celerity::item<1>, auto&) {}); + cgh.parallel_for( + range<1>{1}, reduction(buf_4, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<1>, auto&) {}); })); buffer buf_2{range<2>{1, 2}}; CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<2>{1, 1}, reduction(buf_2, cgh, cl::sycl::plus{}), [=](celerity::item<2>, auto&) {}); + cgh.parallel_for(range<2>{1, 1}, + reduction(buf_2, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<2>, auto&) {}); })); buffer buf_3{range<3>{1, 2, 1}}; CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<3>{1, 1, 1}, reduction(buf_3, cgh, cl::sycl::plus{}), [=](celerity::item<3>, auto&) {}); + cgh.parallel_for(range<3>{1, 1, 1}, + reduction(buf_3, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<3>, auto&) {}); })); buffer buf_5{range<2>{1, 1}}; CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<2>{1, 1}, reduction(buf_5, cgh, cl::sycl::plus{}), [=](celerity::item<2>, auto&) {}); + cgh.parallel_for(range<2>{1, 1}, + reduction(buf_5, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<2>, auto&) {}); })); buffer buf_6{range<3>{1, 1, 1}}; CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { // - cgh.parallel_for(range<3>{1, 1, 1}, reduction(buf_6, cgh, cl::sycl::plus{}), [=](celerity::item<3>, auto&) {}); + cgh.parallel_for(range<3>{1, 1, 1}, + reduction(buf_6, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), [=](celerity::item<3>, auto&) {}); })); #else SKIP_BECAUSE_NO_SCALAR_REDUCTIONS @@ -707,7 +714,7 @@ namespace detail { q.submit([&](handler& cgh) { local_accessor la{32, cgh}; - accessor ga{out, cgh, celerity::access::one_to_one{}, write_only}; + accessor ga{out, cgh, celerity::access::one_to_one{}, write_only, no_init}; cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) { la[item.get_local_id()] = static_cast(item.get_global_linear_id()); group_barrier(item.get_group()); @@ -731,8 +738,9 @@ namespace detail { buffer b{range<1>{1}}; distr_queue{}.submit([&](handler& cgh) { - cgh.parallel_for(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}), - [](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); }); + cgh.parallel_for(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, + reduction(b, cgh, cl::sycl::plus{}, 
property::reduction::initialize_to_identity()), + [](nd_item<2> item, auto& sum) { sum += static_cast(item.get_global_linear_id()); }); }); #else SKIP_BECAUSE_NO_SCALAR_REDUCTIONS @@ -752,8 +760,8 @@ namespace detail { #if CELERITY_FEATURE_SCALAR_REDUCTIONS buffer b{{1}}; q.submit([&](handler& cgh) { - cgh.parallel_for( - range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); + cgh.parallel_for(range<1>{64}, reduction(b, cgh, cl::sycl::plus{}, property::reduction::initialize_to_identity()), + [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); }); q.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, reduction(b, cgh, cl::sycl::plus{}), @@ -1212,7 +1220,7 @@ namespace detail { void dry_run_with_nodes(const size_t num_nodes) { env::scoped_test_environment ste(std::unordered_map{{dryrun_envvar_name, std::to_string(num_nodes)}}); - distr_queue q; + distr_queue q{sycl::device{sycl::default_selector_v}}; // Initialize runtime with a single device so we don't get multiple chunks auto& rt = runtime::get_instance(); auto& tm = rt.get_task_manager(); @@ -1383,7 +1391,7 @@ namespace detail { distr_queue q; q.submit([&](handler& cgh) { - accessor acc(buf, cgh, all{}, write_only, no_init); + accessor acc(buf, cgh, one_to_one(), write_only, no_init); cgh.parallel_for(buf.get_range(), [=](celerity::item<2> item) { acc[item] = static_cast(item.get_linear_id()); }); }); @@ -1451,5 +1459,35 @@ namespace detail { }); } + TEST_CASE_METHOD(test_utils::runtime_fixture, "runtime warns on uninitialized reads", "[runtime]") { + buffer buf(1); + + std::unique_ptr lc; + { + distr_queue q; + lc = std::make_unique(); + + SECTION("in device kernels") { + q.submit([&](handler& cgh) { + accessor acc(buf, cgh, celerity::access::all(), celerity::read_only); + cgh.parallel_for(range(1), [=](item<1>) { (void)acc; }); + }); + } + + SECTION("in host tasks") { + q.submit([&](handler& cgh) { + accessor acc(buf, cgh, celerity::access::all(), celerity::read_only_host_task); + cgh.host_task(on_master_node, [=] { (void)acc; }); + }); + } + + q.slow_full_sync(); + } + + const auto error_message = + "declares a reading access on uninitialized B0 {[0,0,0] - [1,1,1]}. 
Make sure to construct the accessor with no_init if possible."; + CHECK_THAT(lc->get_log(), Catch::Matchers::ContainsSubstring(error_message)); + } + } // namespace detail } // namespace celerity diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc index 4f9aac429..dd2c38b22 100644 --- a/test/system/distr_tests.cc +++ b/test/system/distr_tests.cc @@ -388,12 +388,15 @@ namespace detail { auto& tm = celerity::detail::runtime::get_instance().get_task_manager(); tm.set_horizon_step(1); - for(int i = 0; i < 2; ++i) { - q.submit([&](handler& cgh) { - celerity::accessor acc_a{buff_a, cgh, celerity::access::one_to_one{}, celerity::read_write}; - cgh.parallel_for(range, [=](item<2> item) { (void)acc_a; }); - }); - } + q.submit([&](handler& cgh) { + celerity::accessor acc_a{buff_a, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; + cgh.parallel_for(range, [=](item<2> item) { (void)acc_a; }); + }); + + q.submit([&](handler& cgh) { + celerity::accessor acc_a{buff_a, cgh, celerity::access::one_to_one{}, celerity::read_write}; + cgh.parallel_for(range, [=](item<2> item) { (void)acc_a; }); + }); q.slow_full_sync(); @@ -408,27 +411,26 @@ namespace detail { const std::string expected = "digraph G{label=\"Command Graph\" subgraph cluster_id_0_0{label=<T0 (epoch)>;color=darkgray;id_0_0[label=epoch> fontcolor=black shape=box];}subgraph cluster_id_0_1{label=<T1 \"unnamed_kernel\" " - "(device-compute)>;color=darkgray;id_0_1[label=execution [0,0,0] + [8,16,1]
read_write B0 {[0,0,0] - " + "(device-compute)>;color=darkgray;id_0_1[label=execution [0,0,0] + [8,16,1]
discard_write B0 {[0,0,0] - " "[8,16,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_2{label=<T2 " "(horizon)>;color=darkgray;id_0_2[label=horizon> fontcolor=black shape=box];}subgraph cluster_id_0_3{label=<T3 \"unnamed_kernel\" (device-compute)>;color=darkgray;id_0_3[label=execution [0,0,0] + " - "[8,16,1]
read_write B0 {[0,0,0] - [8,16,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_4{label=<T4 (horizon)>;color=darkgray;id_0_4[label=horizon> fontcolor=black shape=box];}subgraph " - "cluster_id_0_5{label=<T5 (epoch)>;color=darkgray;id_0_5[label=epoch (barrier)> " - "fontcolor=black " - "shape=box];}id_0_0->id_0_1[];id_0_1->id_0_2[color=orange];id_0_1->id_0_3[];id_0_3->id_0_4[color=orange];id_0_2->id_0_4[color=orange];id_0_4->" - "id_0_5[color=orange];subgraph cluster_id_1_0{label=<T0 (epoch)>;color=darkgray;id_1_0[label=epoch> fontcolor=crimson shape=box];}subgraph cluster_id_1_1{label=<T1 \"unnamed_kernel\" " - "(device-compute)>;color=darkgray;id_1_1[label=execution [8,0,0] + [8,16,1]
read_write B0 {[8,0,0] " - "- [16,16,1]}> fontcolor=crimson shape=box];}subgraph cluster_id_1_2{label=<T2 " - "(horizon)>;color=darkgray;id_1_2[label=horizon> fontcolor=crimson shape=box];}subgraph " + "[8,16,1]
read_write B0 {[0,0,0] - [8,16,1]}> fontcolor=black shape=box];}subgraph cluster_id_0_4{label=<T4 " + "(horizon)>;color=darkgray;id_0_4[label=horizon> fontcolor=black shape=box];}subgraph cluster_id_0_5{label=<T5 (epoch)>;color=darkgray;id_0_5[label=epoch (barrier)> fontcolor=black " + "shape=box];}id_0_0->id_0_1[color=orchid];id_0_1->id_0_2[color=orange];id_0_1->id_0_3[];id_0_3->id_0_4[color=orange];id_0_2->id_0_4[color=" + "orange];id_0_4->id_0_5[color=orange];subgraph cluster_id_1_0{label=<T0 " + "(epoch)>;color=darkgray;id_1_0[label=epoch> fontcolor=crimson shape=box];}subgraph cluster_id_1_1{label=<T1 \"unnamed_kernel\" (device-compute)>;color=darkgray;id_1_1[label=execution [8,0,0] + " + "[8,16,1]
discard_write B0 {[8,0,0] - [16,16,1]}> fontcolor=crimson shape=box];}subgraph cluster_id_1_2{label=<T2 (horizon)>;color=darkgray;id_1_2[label=horizon> fontcolor=crimson shape=box];}subgraph " "cluster_id_1_3{label=<T3 \"unnamed_kernel\" (device-compute)>;color=darkgray;id_1_3[label=execution [8,0,0] + [8,16,1]
read_write B0 {[8,0,0] - [16,16,1]}> fontcolor=crimson shape=box];}subgraph " "cluster_id_1_4{label=<T4 (horizon)>;color=darkgray;id_1_4[label=horizon> " "fontcolor=crimson shape=box];}subgraph cluster_id_1_5{label=<T5 (epoch)>;color=darkgray;id_1_5[label=epoch (barrier)> fontcolor=crimson " - "shape=box];}id_1_0->id_1_1[];id_1_1->id_1_2[color=orange];id_1_1->id_1_3[];id_1_3->id_1_4[color=orange];id_1_2->id_1_4[color=orange];id_1_4->" - "id_1_5[color=orange];}"; + "shape=box];}id_1_0->id_1_1[color=orchid];id_1_1->id_1_2[color=orange];id_1_1->id_1_3[];id_1_3->id_1_4[color=orange];id_1_2->id_1_4[color=" + "orange];id_1_4->id_1_5[color=orange];}"; CHECK(graph == expected); @@ -436,5 +438,38 @@ namespace detail { } } + TEST_CASE_METHOD(test_utils::runtime_fixture, "runtime logs errors on overlapping writes between commands", "[runtime]") { + std::unique_ptr lc; + { + distr_queue q; + const auto num_nodes = runtime::get_instance().get_num_nodes(); + if(num_nodes < 2) { SKIP("Test needs at least 2 participating nodes"); } + + lc = std::make_unique(); + + buffer buf(1); + + SECTION("in distributed device kernels") { + q.submit([&](handler& cgh) { + accessor acc(buf, cgh, celerity::access::all(), write_only, no_init); + cgh.parallel_for(range(num_nodes), [=](item<1>) { (void)acc; }); + }); + } + + SECTION("in collective host tasks") { + q.submit([&](handler& cgh) { + accessor acc(buf, cgh, celerity::access::all(), write_only_host_task, no_init); + cgh.host_task(celerity::experimental::collective, [=](experimental::collective_partition) { (void)acc; }); + }); + } + + q.slow_full_sync(); + } + + const auto error_message = "has overlapping writes between multiple nodes in B0 {[0,0,0] - [1,1,1]}. Choose a non-overlapping range mapper for the " + "write access or constrain the split to make the access non-overlapping."; + CHECK_THAT(lc->get_log(), Catch::Matchers::ContainsSubstring(error_message)); + } + } // namespace detail } // namespace celerity diff --git a/test/task_graph_tests.cc b/test/task_graph_tests.cc index 0168c0052..de07b44cf 100644 --- a/test/task_graph_tests.cc +++ b/test/task_graph_tests.cc @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -94,16 +95,18 @@ namespace detail { using namespace cl::sycl::access; auto tt = test_utils::task_test_context{}; - auto buf = tt.mbf.create_buffer(range<1>(128)); + auto buf = tt.mbf.create_buffer(range<1>(128), true /* mark_as_host_initialized */); const auto tid_a = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { buf.get_access(cgh, fixed<1>{{0, 64}}); }); const auto tid_b = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { buf.get_access(cgh, fixed<1>{{0, 128}}); }); - REQUIRE(has_dependency(tt.tm, tid_b, tid_a)); + CHECK(has_dependency(tt.tm, tid_b, tid_a)); + CHECK(has_dependency(tt.tm, tid_b, task_manager::initial_epoch_task)); // for read of the host-initialized part const auto tid_c = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { buf.get_access(cgh, fixed<1>{{64, 128}}); }); - REQUIRE_FALSE(has_dependency(tt.tm, tid_c, tid_a)); + CHECK_FALSE(has_dependency(tt.tm, tid_c, tid_a)); + CHECK(has_dependency(tt.tm, tid_c, task_manager::initial_epoch_task)); // for read of the host-initialized part } TEST_CASE("task_manager correctly generates anti-dependencies", "[task_manager][task-graph]") { @@ -137,9 +140,10 @@ namespace detail { using namespace cl::sycl::access; auto tt = test_utils::task_test_context{}; - auto host_init_buf = tt.mbf.create_buffer(range<1>(128), true); - auto non_host_init_buf = 
tt.mbf.create_buffer(range<1>(128), false); - auto artificial_dependency_buf = tt.mbf.create_buffer(range<1>(1), false); + tt.tm.set_uninitialized_read_policy(error_policy::ignore); // we explicitly test reading from non_host_init_buf + auto host_init_buf = tt.mbf.create_buffer(range<1>(128), true /* mark_as_host_initialized */); + auto non_host_init_buf = tt.mbf.create_buffer(range<1>(128), false /* mark_as_host_initialized */); + auto artificial_dependency_buf = tt.mbf.create_buffer(range<1>(1), false /* mark_as_host_initialized */); const auto tid_a = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { host_init_buf.get_access(cgh, fixed<1>{{0, 128}}); @@ -208,23 +212,20 @@ namespace detail { CAPTURE(producer_mode); auto tt = test_utils::task_test_context{}; - auto buf = tt.mbf.create_buffer(range<1>(128), false); + auto buf = tt.mbf.create_buffer(range<1>(128), true /* mark_as_host_initialized */); - const task_id tid_a = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { - dispatch_get_access(buf, cgh, producer_mode, fixed<1>{{0, 128}}); - }); + const task_id tid_a = + test_utils::add_compute_task(tt.tm, [&](handler& cgh) { dispatch_get_access(buf, cgh, producer_mode, all()); }); - const task_id tid_b = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { - dispatch_get_access(buf, cgh, consumer_mode, fixed<1>{{0, 128}}); - }); - REQUIRE(has_dependency(tt.tm, tid_b, tid_a)); + const task_id tid_b = + test_utils::add_compute_task(tt.tm, [&](handler& cgh) { dispatch_get_access(buf, cgh, consumer_mode, all()); }); + CHECK(has_dependency(tt.tm, tid_b, tid_a)); - const task_id tid_c = test_utils::add_compute_task(tt.tm, [&](handler& cgh) { - dispatch_get_access(buf, cgh, producer_mode, fixed<1>{{0, 128}}); - }); + const task_id tid_c = + test_utils::add_compute_task(tt.tm, [&](handler& cgh) { dispatch_get_access(buf, cgh, producer_mode, all()); }); const bool pure_consumer = consumer_mode == mode::read; const bool pure_producer = producer_mode == mode::discard_read_write || producer_mode == mode::discard_write; - REQUIRE(has_dependency(tt.tm, tid_c, tid_b, pure_consumer || pure_producer ? dependency_kind::anti_dep : dependency_kind::true_dep)); + CHECK(has_dependency(tt.tm, tid_c, tid_b, pure_consumer || pure_producer ? 
dependency_kind::anti_dep : dependency_kind::true_dep)); } } } @@ -305,7 +306,7 @@ namespace detail { auto tt = test_utils::task_test_context{}; tt.tm.set_horizon_step(2); - auto buf_a = tt.mbf.create_buffer(range<1>(128)); + auto buf_a = tt.mbf.create_buffer(range<1>(128), true /* mark_as_host_initialized */); test_utils::add_host_task(tt.tm, on_master_node, [&](handler& cgh) { buf_a.get_access(cgh, fixed<1>({0, 128})); }); @@ -368,7 +369,7 @@ namespace detail { const auto buff_size = 128; const auto num_tasks = 9; const auto buff_elem_per_task = buff_size / num_tasks; - auto buf_a = tt.mbf.create_buffer(range<1>(buff_size)); + auto buf_a = tt.mbf.create_buffer(range<1>(buff_size), true /* mark_as_host_initialized */); auto current_horizon = task_manager_testspy::get_current_horizon(tt.tm); CHECK_FALSE(current_horizon.has_value()); @@ -538,7 +539,7 @@ namespace detail { TEST_CASE("buffer accesses with empty ranges do not generate data-flow dependencies", "[task_manager][task-graph]") { auto tt = test_utils::task_test_context{}; - auto buf = tt.mbf.create_buffer(range<2>(32, 32)); + auto buf = tt.mbf.create_buffer(range<2>(32, 32), true /* mark_as_host_initialized */); const auto write_sr = GENERATE(values({subrange<2>{{16, 16}, {0, 0}}, subrange<2>{{16, 16}, {8, 8}}})); const auto read_sr = GENERATE(values({subrange<2>{{1, 1}, {0, 0}}, subrange<2>{{8, 8}, {16, 16}}})); @@ -713,5 +714,29 @@ namespace detail { CHECK(has_dependency(tt.tm, tid_b, tid_fence, dependency_kind::anti_dep)); } + TEST_CASE("task_manager throws in tests if it detects an uninitialized read", "[task_manager]") { + test_utils::task_test_context tt; + + SECTION("on a fully uninitialized buffer") { + auto buf = tt.mbf.create_buffer<1>({1}); + + CHECK_THROWS_WITH((test_utils::add_compute_task(tt.tm, [&](handler& cgh) { buf.get_access(cgh, all{}); })), + "Task T1 \"uninit_read\" declares a reading access on uninitialized B0 {[0,0,0] - [1,1,1]}. Make sure to construct the accessor with no_init " + "if possible."); + } + + SECTION("on a partially initialized buffer") { + auto buf = tt.mbf.create_buffer<2>({64, 64}); + test_utils::add_compute_task(tt.tm, [&](handler& cgh) { + buf.get_access(cgh, fixed<2>({{0, 0}, {32, 32}})); + }); + + CHECK_THROWS_WITH( + (test_utils::add_compute_task(tt.tm, [&](handler& cgh) { buf.get_access(cgh, all{}); })), + "Task T2 \"uninit_read\" declares a reading access on uninitialized B0 {[0,32,0] - [32,64,1], [32,0,0] - [64,64,1]}. Make sure to construct " + "the accessor with no_init if possible."); + } + } + } // namespace detail } // namespace celerity diff --git a/test/test_utils.h b/test/test_utils.h index b0c5d69ef..9b2f65513 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -210,8 +210,8 @@ namespace test_utils { const detail::buffer_id bid = m_next_buffer_id++; const auto buf = mock_buffer(bid, size); if(m_task_mngr != nullptr) { m_task_mngr->add_buffer(bid, Dims, detail::range_cast<3>(size), mark_as_host_initialized); } - if(m_schdlr != nullptr) { m_schdlr->notify_buffer_registered(bid, Dims, detail::range_cast<3>(size)); } - if(m_dggen != nullptr) { m_dggen->add_buffer(bid, Dims, detail::range_cast<3>(size)); } + if(m_schdlr != nullptr) { m_schdlr->notify_buffer_registered(bid, Dims, detail::range_cast<3>(size), mark_as_host_initialized); } + if(m_dggen != nullptr) { m_dggen->add_buffer(bid, Dims, detail::range_cast<3>(size), mark_as_host_initialized); } return buf; }
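As a closing note for reviewers: the two new diagnostics steer user code toward the pattern sketched below. This is an illustrative example against the public Celerity API as exercised by the tests above (buffer size and kernel body are arbitrary, not taken from this diff): the producer kernel combines `no_init` with a non-overlapping `one_to_one` range mapper, so neither the uninitialized-read warning nor the overlapping-write error fires, while the consumer may still read with an overlapping `all` mapper, since only writes are checked for overlap.

```cpp
#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf{celerity::range<1>{1024}}; // not host-initialized

	q.submit([&](celerity::handler& cgh) {
		// no_init declares that the previous (uninitialized) contents are never read,
		// so the task-level uninitialized-read check stays silent; one_to_one gives
		// every chunk a disjoint write set, so detect_overlapping_writes finds nothing.
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for(buf.get_range(), [=](celerity::item<1> it) { acc[it] = static_cast<int>(it.get_linear_id()); });
	});

	q.submit([&](celerity::handler& cgh) {
		// Overlapping *reads* are fine - only overlapping writes are reported.
		celerity::accessor acc{buf, cgh, celerity::access::all{}, celerity::read_only};
		cgh.parallel_for(buf.get_range(), [=](celerity::item<1> it) { (void)acc; });
	});
	return 0;
}
```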