Change split functions to work on box instead of chunk
psalz committed Dec 20, 2024
1 parent 6a2b416 commit 95b117a
Showing 5 changed files with 76 additions and 69 deletions.
5 changes: 3 additions & 2 deletions include/split.h
@@ -1,5 +1,6 @@
#pragma once

#include "grid.h"
#include "ranges.h"

#include <cstddef>
@@ -8,7 +9,7 @@

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);

} // namespace celerity::detail
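
To see what this change means at a call site, here is a minimal before/after sketch; the extent of 256, the granularity of ones and the chunk count of 4 are illustrative values rather than anything taken from this commit:

// Before: the splitters took a chunk<3>, which carries an offset, a range and the global size.
const chunk<3> full_chunk{id<3>{}, range<3>{256, 1, 1}, range<3>{256, 1, 1}};
const std::vector<chunk<3>> parts_before = split_1d(full_chunk, ones, 4);

// After: the splitters take a box<3>, i.e. just a min/max coordinate pair without a global size,
// and return boxes.
const box<3> full_box{subrange<3>(id<3>{}, range<3>{256, 1, 1})};
const std::vector<box<3>> parts_after = split_1d(full_box, ones, 4);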
12 changes: 7 additions & 5 deletions src/command_graph_generator.cc
@@ -140,13 +140,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b
}

std::vector<command_graph_generator::assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier;
const auto chunks = ([&] {
if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) {
std::vector<chunk<3>> chunks;
std::vector<box<3>> chunks;
for(size_t nid = 0; nid < m_num_nodes; ++nid) {
chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}}));
const id<1> min = tsk.get_type() == task_type::collective ? nid : 0;
const id<1> max = min + 1;
chunks.push_back(box_cast<3>(box<1>{min, max}));
}
return chunks;
}
@@ -157,7 +159,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
}
return std::vector<chunk<3>>{full_chunk};
return std::vector<box<3>>{full_chunk};
})();
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());
@@ -171,7 +173,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
std::vector<assigned_chunk> assigned_chunks;
for(size_t i = 0; i < chunks.size(); ++i) {
const node_id nid = (i / chunks_per_node) % m_num_nodes;
assigned_chunks.push_back({nid, chunks[i]});
assigned_chunks.push_back({nid, chunk<3>(chunks[i].get_min(), chunks[i].get_range(), tsk.get_global_size())});
}
return assigned_chunks;
}
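
Because a box, unlike a chunk, does not carry the task's global size, the loop above re-attaches it when building each assigned_chunk. A small sketch of that conversion, where b stands in for one of the boxes returned by the split and tsk for the task being split:

// b: one box produced by split_1d / split_2d; tsk: the task being split (placeholder names).
// chunk<3> additionally stores the global iteration-space size, which box<3> does not,
// so it is reconstructed from the box's min corner and extent plus the task's global size.
const chunk<3> assigned(b.get_min(), b.get_range(), tsk.get_global_size());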
7 changes: 3 additions & 4 deletions src/instruction_graph_generator.cc
@@ -1502,12 +1502,11 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id;
const auto split = tsk.get_hint<experimental::hints::split_2d>() != nullptr ? split_2d : split_1d;

const auto command_sr = ecmd.get_execution_range();
const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size());
const auto command_chunk = box<3>(ecmd.get_execution_range());

// As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain
// contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap).
std::vector<chunk<3>> coarse_chunks;
std::vector<box<3>> coarse_chunks;
if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) {
coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size());
} else {
@@ -1537,7 +1536,7 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) {
for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) {
auto& localized_chunk = concurrent_chunks.emplace_back();
localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range));
localized_chunk.execution_range = fine_chunk;
if(tsk.get_execution_target() == execution_target::device) {
assert(coarse_idx < m_system.devices.size());
localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory;
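
The hunk above implements the two-level split described in its comment. A rough sketch of that control flow, where command_box, granularity, num_devices and oversubscribe_factor are illustrative stand-ins for the surrounding variables rather than the exact code:

// Sketch only, not the verbatim implementation.
const auto split = use_2d_split ? split_2d : split_1d; // chosen via the split_2d hint
// 1) One coarse box per device keeps each device's portion of the execution range contiguous.
const std::vector<box<3>> coarse = split(command_box, granularity, num_devices);
// 2) Each coarse box is split again so that computation and data transfers can overlap.
for(const box<3>& device_box : coarse) {
    for(const box<3>& fine : split(device_box, granularity, oversubscribe_factor)) {
        // each fine box becomes one concurrently executable chunk on that device
    }
}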
67 changes: 38 additions & 29 deletions src/split.cc
@@ -17,26 +17,26 @@ namespace {
using namespace celerity;
using namespace celerity::detail;

[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) {
region<3> reconstructed_chunk;
for(auto& chnk : split) {
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
assert(region_intersection(reconstructed_chunk, chnk).empty());
reconstructed_chunk = region_union(chnk, reconstructed_chunk);
}
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
assert(region_difference(reconstructed_chunk, full_chunk).empty());
}

template <int Dims>
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
const box<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
range<Dims> small_chunk_size{zeros};
range<Dims> large_chunk_size{zeros};
range<Dims> num_large_chunks{zeros};
for(int d = 0; d < Dims; ++d) {
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d];
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
}
return {small_chunk_size, large_chunk_size, num_large_chunks};
}
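
A worked example of the computation above, using illustrative numbers (a 1D extent of 10, granularity 1, 4 requested chunks), not values from the commit:

// Illustrative numbers only.
const size_t extent = 10, chunks = 4, gran = 1;
const size_t ideal_chunk_size = extent / chunks;                              // 2
const size_t small_chunk_size = (ideal_chunk_size / gran) * gran;             // 2
const size_t large_chunk_size = small_chunk_size + gran;                      // 3
const size_t num_large_chunks = (extent - small_chunk_size * chunks) / gran;  // 2
// split_1d emits the large chunks first, so the resulting extents along dim 0 are {3, 3, 2, 2}.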
@@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
*/
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
assert(num_chunks % factor == 0);
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]};
const size_t f0 = factor;
const size_t f1 = num_chunks / factor;

@@ -71,12 +71,12 @@

// If domain is square(-ish), prefer splitting along slower dimension.
// (These bounds have been chosen arbitrarily!)
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]);
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }

// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1];
const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1];
return circ0 < circ1 ? split_0_1 : split_1_0;

// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
@@ -87,28 +87,35 @@

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_1d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_chunk.get_range()[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.get_range()[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0]);
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
result[i].range[0] = large_chunk_size[0];
result[i].offset[0] += i * large_chunk_size[0];
id<3> min = full_chunk.get_min();
id<3> max = full_chunk.get_max();
min[0] += i * large_chunk_size[0];
max[0] = min[0] + large_chunk_size[0];
result.emplace_back(min, max);
}
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
result[i].range[0] = small_chunk_size[0];
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
id<3> min = full_chunk.get_min();
id<3> max = full_chunk.get_max();
min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
max[0] = min[0] + small_chunk_size[0];
result.emplace_back(min, max);
}

#ifndef NDEBUG
@@ -119,12 +126,12 @@ std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granu
}

// TODO: Make the split dimensions configurable for 3D chunks?
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_chunk.get_range()[d] % granularity[d] == 0);
}
#endif

Expand All @@ -147,21 +154,23 @@ std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granu
const auto actual_num_chunks = best_chunk_counts;
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
id<3> offset = full_chunk.offset;
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0] * actual_num_chunks[1]);
id<3> offset = full_chunk.get_min();

for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
auto& chnk = result[j * actual_num_chunks[1] + i];
chnk.offset = offset;
chnk.range[0] = chunk_size[0];
chnk.range[1] = chunk_size[1];
const id<3> min = offset;
id<3> max = full_chunk.get_max();
max[0] = min[0] + chunk_size[0];
max[1] = min[1] + chunk_size[1];
result.emplace_back(min, max);
offset[1] += chunk_size[1];
}
offset[0] += chunk_size[0];
offset[1] = full_chunk.offset[1];
offset[1] = full_chunk.get_min()[1];
}

#ifndef NDEBUG
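
To make the edge-length heuristic in assign_split_factors_2d concrete, consider an illustrative 256 x 64 domain split into 4 chunks, comparing a 4 x 1 arrangement against a 1 x 4 one (the numbers are chosen for illustration and are not part of the commit):

// 4 x 1: chunks of 64 x 64   -> circ0 = 256/4 + 64/1 = 64 + 64  = 128
// 1 x 4: chunks of 256 x 16  -> circ1 = 256/1 + 64/4 = 256 + 16 = 272
// circ0 < circ1, so the heuristic picks the 4 x 1 split: cutting the long dimension yields
// squarer chunks with shorter edges, which is what the summed-circumference comparison rewards.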
54 changes: 25 additions & 29 deletions test/split_tests.cc
@@ -1,5 +1,3 @@
#include <unordered_set>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators_range.hpp>
@@ -14,19 +12,18 @@ using namespace celerity::detail;
namespace {

template <int Dims>
chunk<3> make_full_chunk(range<Dims> range) {
return {id<3>{}, range_cast<3>(range), range_cast<3>(range)};
box<3> make_full_chunk(range<Dims> range) {
return {id<3>{}, range_cast<3>(range)};
}

void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
void check_1d_split(const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
REQUIRE(split_chunks.size() == chunk_ranges.size());
id<3> offset = full_chunk.offset;
id<3> offset = full_chunk.get_min();
for(size_t i = 0; i < split_chunks.size(); ++i) {
const auto& chnk = split_chunks[i];
REQUIRE_LOOP(chnk.offset == offset);
REQUIRE_LOOP(chnk.range[0] == chunk_ranges[i]);
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
offset[0] += split_chunks[i].range[0];
REQUIRE_LOOP(chnk.get_min() == offset);
REQUIRE_LOOP(chnk.get_range()[0] == chunk_ranges[i]);
offset[0] += chnk.get_range()[0];
}
}

@@ -48,21 +45,20 @@ void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& spl
* to the width of an individual chunk.
*/
void check_2d_split(
const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
REQUIRE(split_chunks.size() == std::accumulate(chunk_ranges.begin(), chunk_ranges.end(), size_t(0), [](size_t c, auto& p) { return c + p.second.size(); }));
REQUIRE(std::all_of(chunk_ranges.begin(), chunk_ranges.end(), [&](auto& p) { return p.second.size() == chunk_ranges[0].second.size(); }));
id<3> offset = full_chunk.offset;
id<3> offset = full_chunk.get_min();
for(size_t j = 0; j < chunk_ranges.size(); ++j) {
const auto& [height, widths] = chunk_ranges[j];
for(size_t i = 0; i < widths.size(); ++i) {
const auto& chnk = split_chunks[j * chunk_ranges[0].second.size() + i];
REQUIRE_LOOP(chnk.offset == offset);
REQUIRE_LOOP(chnk.range[0] == height);
REQUIRE_LOOP(chnk.range[1] == widths[i]);
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
REQUIRE_LOOP(chnk.get_min() == offset);
REQUIRE_LOOP(chnk.get_range()[0] == height);
REQUIRE_LOOP(chnk.get_range()[1] == widths[i]);
offset[1] += widths[i];
}
offset[1] = full_chunk.offset[1];
offset[1] = full_chunk.get_min()[1];
offset[0] += height;
}
}
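
To illustrate the chunk_ranges layout that check_2d_split expects: each entry pairs a row height with the widths of that row's chunks. A hypothetical call matching the 2 x 2 split of a 128 x 128 x 341 full chunk used in the tests below (not a call that appears in the commit):

// Each entry is {row height, {chunk widths within that row}}.
const auto full_chunk = make_full_chunk<3>({128, 128, 341});
check_2d_split(full_chunk, split_2d(full_chunk, ones, 4), {{64, {64, 64}}, {64, {64, 64}}});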
@@ -94,13 +90,13 @@ TEST_CASE("split_1d creates fewer chunks than requested if mandated by granulari
}

TEST_CASE("split_1d preserves offset of original chunk", "[split]") {
const auto full_chunk = chunk<3>{{37, 42, 7}, {128, 1, 1}, {128, 1, 1}};
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {128, 1, 1})};
const auto chunks = split_1d(full_chunk, ones, 4);

CHECK(chunks[0].offset == id<3>{37 + 0, 42, 7});
CHECK(chunks[1].offset == id<3>{37 + 32, 42, 7});
CHECK(chunks[2].offset == id<3>{37 + 64, 42, 7});
CHECK(chunks[3].offset == id<3>{37 + 96, 42, 7});
CHECK(chunks[0].get_min() == id<3>{37 + 0, 42, 7});
CHECK(chunks[1].get_min() == id<3>{37 + 32, 42, 7});
CHECK(chunks[2].get_min() == id<3>{37 + 64, 42, 7});
CHECK(chunks[3].get_min() == id<3>{37 + 96, 42, 7});

check_1d_split(full_chunk, chunks, {32, 32, 32, 32});
}
@@ -109,7 +105,7 @@ TEST_CASE("split_1d preserves ranges of original chunk in other dimensions", "[s
const auto full_chunk = make_full_chunk<3>({128, 42, 341});
const auto chunks = split_1d(full_chunk, ones, 4);
for(size_t i = 0; i < 4; ++i) {
REQUIRE_LOOP(chunks[0].range == range<3>{32, 42, 341});
REQUIRE_LOOP(chunks[0].get_range() == range<3>{32, 42, 341});
}
}

@@ -251,19 +247,19 @@ TEST_CASE("split_2d minimizes edge lengths for non-square domains") {
}

TEST_CASE("split_2d preserves offset of original chunk", "[split]") {
const auto full_chunk = chunk<3>{{37, 42, 7}, {64, 64, 1}, {128, 128, 1}};
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {64, 64, 1})};
const auto chunks = split_2d(full_chunk, ones, 4);
CHECK(chunks[0].offset == id<3>{37, 42, 7});
CHECK(chunks[1].offset == id<3>{37, 42 + 32, 7});
CHECK(chunks[2].offset == id<3>{37 + 32, 42 + 0, 7});
CHECK(chunks[3].offset == id<3>{37 + 32, 42 + 32, 7});
CHECK(chunks[0].get_min() == id<3>{37, 42, 7});
CHECK(chunks[1].get_min() == id<3>{37, 42 + 32, 7});
CHECK(chunks[2].get_min() == id<3>{37 + 32, 42 + 0, 7});
CHECK(chunks[3].get_min() == id<3>{37 + 32, 42 + 32, 7});
}

TEST_CASE("split_2d preserves ranges of original chunk in other dimensions", "[split]") {
const auto full_chunk = make_full_chunk<3>({128, 128, 341});
const auto chunks = split_2d(full_chunk, ones, 4);
for(size_t i = 0; i < 4; ++i) {
REQUIRE_LOOP(chunks[i].range == range<3>{64, 64, 341});
REQUIRE_LOOP(chunks[i].get_range() == range<3>{64, 64, 341});
}
}

