Also pin benchmark threads in microbenchmarks (without runtime)
PeterTh committed Nov 20, 2024
1 parent 749bc6d commit 7622cd3
Showing 4 changed files with 30 additions and 1 deletion.
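
All four files follow the same pattern: each benchmark case constructs an RAII test_utils::benchmark_thread_pinner as its first statement, so the measuring thread is pinned even though these microbenchmarks never start a Celerity runtime. Below is a minimal, hypothetical illustration of that pattern; the benchmark body is invented for this sketch and is not part of the commit.

#include <numeric>
#include <vector>

#include <catch2/benchmark/catch_benchmark.hpp>
#include <catch2/catch_test_macros.hpp>

#include "test_utils.h"

// Hypothetical benchmark using the new helper: the pinner is the first statement,
// so all timed work below runs on a pinned thread for more consistent results.
TEST_CASE("benchmark something without a runtime", "[benchmark][group:example]") {
    test_utils::benchmark_thread_pinner pinner;
    const std::vector<int> data(1024, 1);
    BENCHMARK("sum 1024 ints") { return std::accumulate(data.begin(), data.end(), 0); };
}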
1 change: 1 addition & 0 deletions src/platform_specific/affinity.unix.cc
@@ -3,6 +3,7 @@
#include <mutex>
#include <optional>
#include <unordered_map>
#include <vector>

#include <pthread.h>
#include <sched.h>
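
The added <vector> include supports the pinning code in this file. For orientation, here is a minimal sketch of what pinning the calling thread to one core looks like with the Linux pthread API; this is illustrative only, not Celerity's actual implementation, which additionally maps thread roles (application, scheduler, ...) to specific cores.

#include <cstdio>

#include <pthread.h>
#include <sched.h>

// Illustrative only: restrict the calling thread to a single core.
bool pin_calling_thread_to_core(const unsigned core) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(core, &set);
    const int err = pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
    if(err != 0) { std::fprintf(stderr, "pthread_setaffinity_np failed with error %d\n", err); }
    return err == 0;
}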
10 changes: 9 additions & 1 deletion test/dag_benchmarks.cc
@@ -18,6 +18,7 @@ struct bench_graph_node : intrusive_graph_node<bench_graph_node> {};

// try to cover the dependency counts we'll see in practice
TEMPLATE_TEST_CASE_SIG("benchmark intrusive graph dependency handling with N nodes", "[benchmark][group:graph-nodes]", ((int N), N), 1, 10, 100) {
test_utils::benchmark_thread_pinner pinner;
// note that bench_graph_nodes are created/destroyed *within* the BENCHMARK
// in the first two cases while the latter 2 cases only operate on already
// existing nodes -- this is intentional; both cases are relevant in practice
@@ -61,6 +62,7 @@ TEMPLATE_TEST_CASE_SIG("benchmark intrusive graph dependency handling with N nod
}

TEST_CASE("benchmark task handling", "[benchmark][group:task-graph]") {
test_utils::benchmark_thread_pinner pinner;
constexpr int N = 10000;
constexpr int report_interval = 10;

@@ -97,7 +99,6 @@ static constexpr instruction_graph_generator::policy_set benchmark_instruction_g
/* overlapping_write_error */ CELERITY_ACCESS_PATTERN_DIAGNOSTICS ? error_policy::panic : error_policy::ignore,
};


struct task_manager_benchmark_context {
const size_t num_nodes = 1;
task_graph tdag;
@@ -242,6 +243,8 @@ class restartable_thread {
std::thread m_thread{&restartable_thread::main, this};

void main() {
// This thread is used for scheduling, so pin it to the scheduler core
detail::thread_pinning::pin_this_thread(detail::thread_pinning::thread_type::scheduler);
std::unique_lock lk{m_mutex};
for(;;) {
m_update.wait(lk, [this] { return !std::holds_alternative<empty>(m_next); });
@@ -470,28 +473,33 @@ void run_benchmarks(BenchmarkContextFactory&& make_ctx) {
}

TEST_CASE("generating large task graphs", "[benchmark][group:task-graph]") {
test_utils::benchmark_thread_pinner pinner;
run_benchmarks([] { return task_manager_benchmark_context{}; });
}

TEMPLATE_TEST_CASE_SIG("generating large command graphs for N nodes", "[benchmark][group:command-graph]", ((size_t NumNodes), NumNodes), 1, 4, 16) {
test_utils::benchmark_thread_pinner pinner;
run_benchmarks([] { return command_graph_generator_benchmark_context{NumNodes}; });
}

TEMPLATE_TEST_CASE_SIG(
"generating large instruction graphs for N devices", "[benchmark][group:instruction-graph]", ((size_t NumDevices), NumDevices), 1, 4, 16) {
test_utils::benchmark_thread_pinner pinner;
constexpr static size_t num_nodes = 2;
run_benchmarks([] { return instruction_graph_generator_benchmark_context(num_nodes, NumDevices); });
}

TEMPLATE_TEST_CASE_SIG("generating large instruction graphs for N devices without d2d copy support", "[benchmark][group:instruction-graph]",
((size_t NumDevices), NumDevices), 1, 4, 16) {
test_utils::benchmark_thread_pinner pinner;
constexpr static size_t num_nodes = 2;
run_benchmarks([] { return instruction_graph_generator_benchmark_context(num_nodes, NumDevices, false /* supports_d2d_copies */); });
}

TEMPLATE_TEST_CASE_SIG("building command- and instruction graphs in a dedicated scheduler thread for N nodes", "[benchmark][group:scheduler]",
((size_t NumNodes), NumNodes), 1, 4) //
{
test_utils::benchmark_thread_pinner pinner;
constexpr static size_t num_devices = 1;
SECTION("reference: single-threaded immediate graph generation") {
run_benchmarks([&] { return command_graph_generator_benchmark_context(NumNodes); });
6 changes: 6 additions & 0 deletions test/grid_benchmarks.cc
@@ -12,6 +12,7 @@ using namespace celerity;
using namespace celerity::detail;

TEST_CASE("normalizing randomized box sets - 2d", "[benchmark][group:grid]") {
test_utils::benchmark_thread_pinner pinner;
const auto [label, grid_size, max_box_size, num_boxes] = GENERATE(values<std::tuple<const char*, size_t, size_t, size_t>>({
{"small", 10, 5, 4},
{"medium", 50, 1, 50},
@@ -33,6 +34,7 @@ TEST_CASE("normalizing randomized box sets - 2d", "[benchmark][group:grid]") {
}

TEST_CASE("normalizing randomized box sets - 3d", "[benchmark][group:grid]") {
test_utils::benchmark_thread_pinner pinner;
const auto [label, grid_size, max_box_size, num_boxes] = GENERATE(values<std::tuple<const char*, size_t, size_t, size_t>>({
{"small", 10, 5, 4},
{"medium", 50, 1, 50},
@@ -66,6 +68,7 @@ box_vector<Dims> create_box_tiling(const size_t n_per_side) {
}

TEMPLATE_TEST_CASE_SIG("normalizing a fully mergeable tiling of boxes", "[benchmark][group:grid]", ((int Dims), Dims), 1, 2, 3) {
test_utils::benchmark_thread_pinner pinner;
const auto [label, n] = GENERATE(values<std::tuple<const char*, size_t>>({
{"small", 4},
{"medium", 50},
@@ -92,6 +95,7 @@ TEMPLATE_TEST_CASE_SIG("normalizing a fully mergeable tiling of boxes", "[benchm
}

TEST_CASE("performing set operations between randomized regions - 2d", "[benchmark][group:grid]") {
test_utils::benchmark_thread_pinner pinner;
const auto [label, grid_size, max_box_size, num_boxes] = GENERATE(values<std::tuple<const char*, size_t, size_t, size_t>>({
{"small", 10, 5, 4},
{"medium", 50, 1, 50},
@@ -129,6 +133,7 @@ TEST_CASE("performing set operations between randomized regions - 2d", "[benchma
}

TEST_CASE("performing set operations between randomized regions - 3d", "[benchmark][group:grid]") {
test_utils::benchmark_thread_pinner pinner;
const auto [label, grid_size, max_box_size, num_boxes] = GENERATE(values<std::tuple<const char*, size_t, size_t, size_t>>({
{"small", 10, 5, 4},
{"medium", 50, 1, 50},
@@ -158,6 +163,7 @@ box_vector<2> create_interlocking_boxes(const size_t num_boxes_per_side) {
}

TEST_CASE("normalizing a fully mergeable, complex tiling of boxes - 2d", "[benchmark][group:grid]") {
test_utils::benchmark_thread_pinner pinner;
const auto [label, n] = GENERATE(values<std::tuple<const char*, size_t>>({
{"small", 10},
{"large", 200},
14 changes: 14 additions & 0 deletions test/test_utils.h
@@ -156,6 +156,20 @@ namespace detail {

namespace test_utils {

// Pin the benchmark threads (even in absence of a runtime) for more consistent results
struct benchmark_thread_pinner {
benchmark_thread_pinner() {
const detail::thread_pinning::runtime_configuration cfg{
.enabled = true,
.use_backend_device_submission_threads = false,
};
m_thread_pinner.emplace(cfg);
detail::thread_pinning::pin_this_thread(detail::thread_pinning::thread_type::application);
}

std::optional<detail::thread_pinning::thread_pinner> m_thread_pinner;
};

inline const detail::task* find_task(const detail::task_graph& tdag, const detail::task_id tid) {
return detail::graph_testspy::find_node_if(tdag, [tid](const detail::task& tsk) { return tsk.get_id() == tid; });
}
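
When debugging noisy benchmark numbers it can be useful to confirm that the pinner actually took effect. A hypothetical helper for that, not part of this commit, which reads back the calling thread's affinity mask on Linux:

#include <vector>

#include <pthread.h>
#include <sched.h>

// Hypothetical debugging helper: lists the cores the calling thread is allowed
// to run on, i.e. its effective affinity mask after pinning.
std::vector<int> current_thread_affinity() {
    cpu_set_t set;
    CPU_ZERO(&set);
    std::vector<int> cores;
    if(pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0) {
        for(int core = 0; core < CPU_SETSIZE; ++core) {
            if(CPU_ISSET(core, &set)) { cores.push_back(core); }
        }
    }
    return cores;
}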