Skip to content

Commit

Permalink
Add test of various parallel scenarios
Browse files Browse the repository at this point in the history
  • Loading branch information
abadams committed Oct 28, 2024
1 parent edeae97 commit 4b25a9e
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 0 deletions.
1 change: 1 addition & 0 deletions test/performance/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ tests(GROUPS performance multithreaded
matrix_multiplication.cpp
memory_profiler.cpp
parallel_performance.cpp
parallel_scenarios.cpp
profiler.cpp
rfactor.cpp
sort.cpp
Expand Down
98 changes: 98 additions & 0 deletions test/performance/parallel_scenarios.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#include "Halide.h"
#include "halide_thread_pool.h"

using namespace Halide;

// Performance test that records the distribution of per-iteration runtimes of
// a small parallel Halide pipeline across a grid of scenarios:
//   - memory_bound: random reads spread over the whole input buffer vs. a
//     128-element window,
//   - contended:    eight host tasks invoking the pipeline concurrently vs. a
//     single caller,
//   - inner / outer: trip counts of the serial reduction and of the parallel
//     loop over x.
// One row of eight evenly spaced order statistics (in nanoseconds per
// inner*outer iteration) is printed per scenario.
int main(int argc, char **argv) {
    Param<int> inner_iterations, outer_iterations, memory_limit;
    ImageParam input(Float(32), 1);

    Func f, g;
    Var x;

    RDom r(0, inner_iterations);
    // Make an inner loop with a floating point sqrt, some integer
    // multiply-adds, and a random int generation, and a random memory access.
    f(x) = sum(sqrt(input(random_int(r) % memory_limit)));

    // g reads f at 0 and at outer_iterations - 1. Presumably this is what makes
    // f get computed over [0, outer_iterations) -- confirm against Halide's
    // bounds inference rules.
    g() = f(0) + f(outer_iterations - 1);

    f.compute_root().parallel(x);

    auto out = Runtime::Buffer<float>::make_scalar();
    // Number of float elements in the input buffer (not bytes).
    const int max_memory = 100 * 1024 * 1024;
    Runtime::Buffer<float> in(max_memory);
    in.fill(17.0f);

    auto callable = g.compile_to_callable({inner_iterations, outer_iterations, memory_limit, input});

    // We want the full distribution of runtimes, not the denoised min, so we
    // won't use Tools::benchmark here.

    // Thread count reported by the JIT runtime before this test changes it.
    int native_threads = Halide::Internal::JITSharedRuntime::get_num_threads();

    // Runs one scenario and prints one row of results.
    //   m: memory bound (index range is the whole input rather than 128 elements)
    //   c: contended (samples are taken from 8 concurrent host tasks)
    //   i: value passed for inner_iterations
    //   o: value passed for outer_iterations
    auto bench = [&](bool m, bool c, int i, int o) {
        const int num_samples = 128;
        // Named differently from the Param<int> memory_limit above to avoid
        // shadowing it.
        const int index_range = m ? max_memory : 128;

        // Times a single invocation of the pipeline and returns nanoseconds
        // per (inner * outer) iteration.
        auto bench_one = [&]() {
            auto t1 = std::chrono::high_resolution_clock::now();
            callable(i, o, index_range, in, out);
            auto t2 = std::chrono::high_resolution_clock::now();
            // Form the divisor in double so that larger scenarios cannot
            // overflow int.
            const double total_iterations = static_cast<double>(i) * o;
            return 1e9 * std::chrono::duration<float>(t2 - t1).count() / total_iterations;
        };

        std::vector<float> times(num_samples);
        if (c) {
            Halide::Tools::ThreadPool<void> thread_pool;
            const int num_tasks = 8;
            const int samples_per_task = num_samples / num_tasks;
            Halide::Internal::JITSharedRuntime::set_num_threads(num_tasks * native_threads);
            std::vector<std::future<void>> futures(num_tasks);
            for (size_t t = 0; t < futures.size(); t++) {
                // The by-reference captures stay valid because every future is
                // waited on below, before this scope exits.
                futures[t] = thread_pool.async(
                    [&](size_t t) {
                        // One untimed run before sampling.
                        bench_one();
                        // Each task writes a disjoint slice of `times`.
                        for (int s = 0; s < samples_per_task; s++) {
                            size_t idx = t * samples_per_task + s;
                            times[idx] = bench_one();
                        }
                    },
                    t);
            }
            for (auto &pending : futures) {
                pending.get();
            }
        } else {
            Halide::Internal::JITSharedRuntime::set_num_threads(native_threads);
            // One untimed run before sampling.
            bench_one();
            for (int s = 0; s < num_samples; s++) {
                times[s] = bench_one();
            }
        }
        std::sort(times.begin(), times.end());
        printf("%d %d %d %d ", m, c, i, o);
        // Print n evenly spaced order statistics, each taken at the midpoint
        // of its 1/n slice of the sorted samples.
        const int n = 8;
        int off = (num_samples / n) / 2;
        for (int q = 0; q < n; q++) {
            printf("%g ", times[off + (num_samples * q) / n]);
        }
        printf("\n");
    };

    // The output is designed to be copy-pasted into a spreadsheet, not read by a human
    // The header names one column per value printed by bench: four scenario
    // fields followed by eight timing columns t0..t7.
    printf("memory_bound contended inner outer t0 t1 t2 t3 t4 t5 t6 t7\n");
    for (bool contended : {false, true}) {
        for (bool memory_bound : {false, true}) {
            for (int i : {1 << 0, 1 << 6, 1 << 12, 1 << 18}) {
                for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) {
                    bench(memory_bound, contended, i, o);
                }
            }
        }
    }

    printf("Success!\n");

    return 0;
}

0 comments on commit 4b25a9e

Please sign in to comment.