diff --git a/include/split.h b/include/split.h index cd8a6fd4..ba455719 100644 --- a/include/split.h +++ b/include/split.h @@ -1,5 +1,6 @@ #pragma once +#include "grid.h" #include "ranges.h" #include <vector> @@ -8,7 +9,7 @@ namespace celerity::detail { -std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks); -std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks); +std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxes); +std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxes); } // namespace celerity::detail diff --git a/src/command_graph_generator.cc b/src/command_graph_generator.cc index c6961842..da56682e 100644 --- a/src/command_graph_generator.cc +++ b/src/command_graph_generator.cc @@ -140,13 +140,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b } std::vector<assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const { - const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()}; + const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())}; const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier; const auto chunks = ([&] { if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) { - std::vector<chunk<3>> chunks; + std::vector<box<3>> chunks; for(size_t nid = 0; nid < m_num_nodes; ++nid) { - chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}})); + const id<1> min = tsk.get_type() == task_type::collective ? 
nid : 0; + const id<1> max = min + 1; + chunks.push_back(box_cast<3>(box<1>{min, max})); } return chunks; } @@ -157,7 +159,7 @@ std::vector command_graph_generator::sp if(tsk.get_hint() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); } return split_1d(full_chunk, tsk.get_granularity(), num_chunks); } - return std::vector<chunk<3>>{full_chunk}; + return std::vector<box<3>>{full_chunk}; })(); assert(chunks.size() <= num_chunks); // We may have created less than requested assert(!chunks.empty()); @@ -171,7 +173,7 @@ std::vector command_graph_generator::sp std::vector<assigned_chunk> assigned_chunks; for(size_t i = 0; i < chunks.size(); ++i) { const node_id nid = (i / chunks_per_node) % m_num_nodes; - assigned_chunks.push_back({nid, chunks[i]}); + assigned_chunks.push_back({nid, chunk<3>(chunks[i].get_min(), chunks[i].get_range(), tsk.get_global_size())}); } return assigned_chunks; } diff --git a/src/instruction_graph_generator.cc b/src/instruction_graph_generator.cc index 3aa669b1..0d2dc73f 100644 --- a/src/instruction_graph_generator.cc +++ b/src/instruction_graph_generator.cc @@ -1502,12 +1502,11 @@ std::vector generator_impl::split_task_execution_range(const ex tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id; const auto split = tsk.get_hint() != nullptr ? split_2d : split_1d; - const auto command_sr = ecmd.get_execution_range(); - const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size()); + const auto command_chunk = box<3>(ecmd.get_execution_range()); // As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain // contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap). 
- std::vector<chunk<3>> coarse_chunks; + std::vector<box<3>> coarse_chunks; if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) { coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size()); } else { @@ -1537,7 +1536,7 @@ std::vector generator_impl::split_task_execution_range(const ex for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) { for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) { auto& localized_chunk = concurrent_chunks.emplace_back(); - localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range)); + localized_chunk.execution_range = fine_chunk; if(tsk.get_execution_target() == execution_target::device) { assert(coarse_idx < m_system.devices.size()); localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory; diff --git a/src/split.cc b/src/split.cc index 271b3380..7d3bad9b 100644 --- a/src/split.cc +++ b/src/split.cc @@ -17,26 +17,26 @@ namespace { using namespace celerity; using namespace celerity::detail; -[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) { +[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) { region<3> reconstructed_chunk; for(auto& chnk : split) { - assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty()); - reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk); + assert(region_intersection(reconstructed_chunk, chnk).empty()); + reconstructed_chunk = region_union(chnk, reconstructed_chunk); } - assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty()); + assert(region_difference(reconstructed_chunk, full_chunk).empty()); } template <int Dims> std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks( - const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) { + const box<3>& full_chunk, const range<3>& granularity, 
const std::array<size_t, Dims>& actual_num_chunks) { range<Dims> small_chunk_size{zeros}; range<Dims> large_chunk_size{zeros}; range<Dims> num_large_chunks{zeros}; for(int d = 0; d < Dims; ++d) { - const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d]; + const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d]; small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d]; large_chunk_size[d] = small_chunk_size[d] + granularity[d]; - num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d]; + num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d]; } return {small_chunk_size, large_chunk_size, num_large_chunks}; } @@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks * @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most * (f0, f1) or (f1, f0), however may be less if constrained by the split granularity. */ -std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) { +std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) { assert(num_chunks % factor == 0); - const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]}; + const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]}; const size_t f0 = factor; const size_t f1 = num_chunks / factor; @@ -71,12 +71,12 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const // If domain is square(-ish), prefer splitting along slower dimension. // (These bounds have been chosen arbitrarily!) 
- const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]); + const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]); if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; } // For non-square domains, prefer split that produces shorter edges (compare sum of circumferences) - const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1]; - const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1]; + const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1]; + const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1]; return circ0 < circ1 ? split_0_1 : split_1_0; // TODO: Yet another heuristic we may want to consider is how even chunk sizes are, @@ -87,28 +87,35 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const namespace celerity::detail { -std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) { +std::vector<box<3>> split_1d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) { #ifndef NDEBUG assert(num_chunks > 0); for(int d = 0; d < 3; ++d) { assert(granularity[d] > 0); - assert(full_chunk.range[d] % granularity[d] == 0); + assert(full_chunk.get_range()[d] % granularity[d] == 0); } #endif // Due to split granularity requirements or if num_workers > global_size[0], // we may not be able to create the requested number of chunks. 
- const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])}; + const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.get_range()[0] / granularity[0])}; const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks); - std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size}); + std::vector<box<3>> result; + result.reserve(actual_num_chunks[0]); for(auto i = 0u; i < num_large_chunks[0]; ++i) { - result[i].range[0] = large_chunk_size[0]; - result[i].offset[0] += i * large_chunk_size[0]; + id<3> min = full_chunk.get_min(); + id<3> max = full_chunk.get_max(); + min[0] += i * large_chunk_size[0]; + max[0] = min[0] + large_chunk_size[0]; + result.emplace_back(min, max); } for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) { - result[i].range[0] = small_chunk_size[0]; - result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0]; + id<3> min = full_chunk.get_min(); + id<3> max = full_chunk.get_max(); + min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0]; + max[0] = min[0] + small_chunk_size[0]; + result.emplace_back(min, max); } #ifndef NDEBUG @@ -119,12 +126,12 @@ std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granu } // TODO: Make the split dimensions configurable for 3D chunks? 
-std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) { +std::vector<box<3>> split_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) { #ifndef NDEBUG assert(num_chunks > 0); for(int d = 0; d < 3; ++d) { assert(granularity[d] > 0); - assert(full_chunk.range[d] % granularity[d] == 0); + assert(full_chunk.get_range()[d] % granularity[d] == 0); } #endif @@ -147,21 +154,23 @@ std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granu const auto actual_num_chunks = best_chunk_counts; const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks); - std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size}); - id<3> offset = full_chunk.offset; + std::vector<box<3>> result; + result.reserve(actual_num_chunks[0] * actual_num_chunks[1]); + id<3> offset = full_chunk.get_min(); for(size_t j = 0; j < actual_num_chunks[0]; ++j) { range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0}; for(size_t i = 0; i < actual_num_chunks[1]; ++i) { chunk_size[1] = (i < num_large_chunks[1]) ? 
large_chunk_size[1] : small_chunk_size[1]; - auto& chnk = result[j * actual_num_chunks[1] + i]; - chnk.offset = offset; - chnk.range[0] = chunk_size[0]; - chnk.range[1] = chunk_size[1]; + const id<3> min = offset; + id<3> max = full_chunk.get_max(); + max[0] = min[0] + chunk_size[0]; + max[1] = min[1] + chunk_size[1]; + result.emplace_back(min, max); offset[1] += chunk_size[1]; } offset[0] += chunk_size[0]; - offset[1] = full_chunk.offset[1]; + offset[1] = full_chunk.get_min()[1]; } #ifndef NDEBUG diff --git a/test/split_tests.cc b/test/split_tests.cc index 97cc6986..20c9bb6f 100644 --- a/test/split_tests.cc +++ b/test/split_tests.cc @@ -1,5 +1,3 @@ -#include - #include #include #include @@ -14,19 +12,18 @@ using namespace celerity::detail; namespace { template <int Dims> -chunk<3> make_full_chunk(range<Dims> range) { - return {id<3>{}, range_cast<3>(range), range_cast<3>(range)}; +box<3> make_full_chunk(range<Dims> range) { + return {id<3>{}, range_cast<3>(range)}; } -void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) { +void check_1d_split(const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) { REQUIRE(split_chunks.size() == chunk_ranges.size()); - id<3> offset = full_chunk.offset; + id<3> offset = full_chunk.get_min(); for(size_t i = 0; i < split_chunks.size(); ++i) { const auto& chnk = split_chunks[i]; - REQUIRE_LOOP(chnk.offset == offset); - REQUIRE_LOOP(chnk.range[0] == chunk_ranges[i]); - REQUIRE_LOOP(chnk.global_size == full_chunk.global_size); - offset[0] += split_chunks[i].range[0]; + REQUIRE_LOOP(chnk.get_min() == offset); + REQUIRE_LOOP(chnk.get_range()[0] == chunk_ranges[i]); + offset[0] += chnk.get_range()[0]; } } @@ -48,21 +45,20 @@ void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& spl * to the width of an individual chunk. 
*/ void check_2d_split( - const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) { + const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) { REQUIRE(split_chunks.size() == std::accumulate(chunk_ranges.begin(), chunk_ranges.end(), size_t(0), [](size_t c, auto& p) { return c + p.second.size(); })); REQUIRE(std::all_of(chunk_ranges.begin(), chunk_ranges.end(), [&](auto& p) { return p.second.size() == chunk_ranges[0].second.size(); })); - id<3> offset = full_chunk.offset; + id<3> offset = full_chunk.get_min(); for(size_t j = 0; j < chunk_ranges.size(); ++j) { const auto& [height, widths] = chunk_ranges[j]; for(size_t i = 0; i < widths.size(); ++i) { const auto& chnk = split_chunks[j * chunk_ranges[0].second.size() + i]; - REQUIRE_LOOP(chnk.offset == offset); - REQUIRE_LOOP(chnk.range[0] == height); - REQUIRE_LOOP(chnk.range[1] == widths[i]); - REQUIRE_LOOP(chnk.global_size == full_chunk.global_size); + REQUIRE_LOOP(chnk.get_min() == offset); + REQUIRE_LOOP(chnk.get_range()[0] == height); + REQUIRE_LOOP(chnk.get_range()[1] == widths[i]); offset[1] += widths[i]; } - offset[1] = full_chunk.offset[1]; + offset[1] = full_chunk.get_min()[1]; offset[0] += height; } } @@ -94,13 +90,13 @@ TEST_CASE("split_1d creates fewer chunks than requested if mandated by granulari } TEST_CASE("split_1d preserves offset of original chunk", "[split]") { - const auto full_chunk = chunk<3>{{37, 42, 7}, {128, 1, 1}, {128, 1, 1}}; + const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {128, 1, 1})}; const auto chunks = split_1d(full_chunk, ones, 4); - CHECK(chunks[0].offset == id<3>{37 + 0, 42, 7}); - CHECK(chunks[1].offset == id<3>{37 + 32, 42, 7}); - CHECK(chunks[2].offset == id<3>{37 + 64, 42, 7}); - CHECK(chunks[3].offset == id<3>{37 + 96, 42, 7}); + CHECK(chunks[0].get_min() == id<3>{37 + 0, 42, 7}); + CHECK(chunks[1].get_min() == id<3>{37 + 32, 42, 7}); + CHECK(chunks[2].get_min() == id<3>{37 + 64, 42, 
7}); + CHECK(chunks[3].get_min() == id<3>{37 + 96, 42, 7}); check_1d_split(full_chunk, chunks, {32, 32, 32, 32}); } @@ -109,7 +105,7 @@ TEST_CASE("split_1d preserves ranges of original chunk in other dimensions", "[s const auto full_chunk = make_full_chunk<3>({128, 42, 341}); const auto chunks = split_1d(full_chunk, ones, 4); for(size_t i = 0; i < 4; ++i) { - REQUIRE_LOOP(chunks[0].range == range<3>{32, 42, 341}); + REQUIRE_LOOP(chunks[0].get_range() == range<3>{32, 42, 341}); } } @@ -251,19 +247,19 @@ TEST_CASE("split_2d minimizes edge lengths for non-square domains") { } TEST_CASE("split_2d preserves offset of original chunk", "[split]") { - const auto full_chunk = chunk<3>{{37, 42, 7}, {64, 64, 1}, {128, 128, 1}}; + const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {64, 64, 1})}; const auto chunks = split_2d(full_chunk, ones, 4); - CHECK(chunks[0].offset == id<3>{37, 42, 7}); - CHECK(chunks[1].offset == id<3>{37, 42 + 32, 7}); - CHECK(chunks[2].offset == id<3>{37 + 32, 42 + 0, 7}); - CHECK(chunks[3].offset == id<3>{37 + 32, 42 + 32, 7}); + CHECK(chunks[0].get_min() == id<3>{37, 42, 7}); + CHECK(chunks[1].get_min() == id<3>{37, 42 + 32, 7}); + CHECK(chunks[2].get_min() == id<3>{37 + 32, 42 + 0, 7}); + CHECK(chunks[3].get_min() == id<3>{37 + 32, 42 + 32, 7}); } TEST_CASE("split_2d preserves ranges of original chunk in other dimensions", "[split]") { const auto full_chunk = make_full_chunk<3>({128, 128, 341}); const auto chunks = split_2d(full_chunk, ones, 4); for(size_t i = 0; i < 4; ++i) { - REQUIRE_LOOP(chunks[i].range == range<3>{64, 64, 341}); + REQUIRE_LOOP(chunks[i].get_range() == range<3>{64, 64, 341}); } }