Skip to content

Commit

Permalink
Improve benchmark plots
Browse files Browse the repository at this point in the history
* Add simple Stats class for statistics
* Add error bars based on SEM to plots where mean is reported
* Add a warmup run
* Run 20 steps/iterations
* Improve plot display (bar borders, xtic nomirror)

Fixes: #401
  • Loading branch information
bernhardmgruber committed Nov 22, 2023
1 parent 07ae18d commit d81094c
Show file tree
Hide file tree
Showing 10 changed files with 275 additions and 174 deletions.
38 changes: 17 additions & 21 deletions examples/alpaka/daxpy/daxpy.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stats.hpp"
#include "../../common/Stopwatch.hpp"
#include "../../common/env.hpp"

Expand All @@ -15,8 +16,7 @@

constexpr auto problemSize = std::size_t{1024} * 1024 * 128;
constexpr auto gpuBlockSize = std::size_t{256};
constexpr auto warmupSteps = 1;
constexpr auto steps = 5;
constexpr auto steps = 20; // number of steps to calculate, excluding 1 warmup run
constexpr auto alpha = 3.14;

static_assert(problemSize % gpuBlockSize == 0);
Expand All @@ -40,18 +40,15 @@ void daxpy(std::ofstream& plotFile)
}
watch.printAndReset("init");

double sum = 0;
for(std::size_t s = 0; s < warmupSteps + steps; ++s)
common::Stats stats;
for(std::size_t s = 0; s < steps + 1; ++s)
{
#pragma omp parallel for
for(std::ptrdiff_t i = 0; i < problemSize; i++)
z[i] = alpha * x[i] + y[i];
if(s < warmupSteps)
watch.printAndReset("daxpy (warmup)");
else
sum += watch.printAndReset("daxpy");
stats(watch.printAndReset("daxpy"));
}
plotFile << std::quoted(title) << "\t" << sum / steps << '\n';
plotFile << std::quoted(title) << "\t" << stats.mean() << "\t" << stats.sem() << '\n';
}

template<typename Acc>
Expand Down Expand Up @@ -116,8 +113,8 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
alpaka::Vec<Dim, Size>{Size{1}});
watch = {};

double sum = 0;
for(std::size_t s = 0; s < warmupSteps + steps; ++s)
common::Stats stats;
for(std::size_t s = 0; s < steps + 1; ++s)
{
auto kernel = [] ALPAKA_FN_ACC(
const Acc& acc,
Expand All @@ -137,10 +134,7 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
llama::shallowCopy(viewY),
alpha,
llama::shallowCopy(viewZ));
if(s < warmupSteps)
watch.printAndReset("daxpy (warmup)");
else
sum += watch.printAndReset("daxpy");
stats(watch.printAndReset("daxpy"));
}

for(std::size_t i = 0; i < Mapping::blobCount; i++)
Expand All @@ -150,7 +144,7 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
}
watch.printAndReset("copy D->H");

plotFile << std::quoted(title) << "\t" << sum / steps << '\n';
plotFile << std::quoted(title) << "\t" << stats.mean() << "\t" << stats.sem() << '\n';
}

auto main() -> int
Expand All @@ -164,19 +158,21 @@ try
problemSize * sizeof(double) / 1024 / 1024,
env);

std::ofstream plotFile{"daxpy.sh"};
std::ofstream plotFile{"daxpy_alpaka.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
# {}
set title "daxpy CPU {}Mi doubles"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set style histogram errorbars
set style fill solid border -1
set xtics rotate by 45 right nomirror
set key off
set yrange [0:*]
set ylabel "runtime [s]"
$data << EOD
"" "runtime" "runtime_sem"
)",
env,
problemSize / 1024 / 1024);
Expand Down Expand Up @@ -261,9 +257,9 @@ set ylabel "runtime [s]"
llama::Constant<10>>{extents});

plotFile << R"(EOD
plot $data using 2:xtic(1)
plot $data using 2:3:xtic(1) ti col
)";
std::cout << "Plot with: ./daxpy.sh\n";
std::cout << "Plot with: ./daxpy_alpaka.sh\n";

return 0;
}
Expand Down
25 changes: 14 additions & 11 deletions examples/alpaka/nbody/nbody.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stats.hpp"
#include "../../common/Stopwatch.hpp"
#include "../../common/env.hpp"

Expand All @@ -23,7 +24,7 @@
using FP = float;

constexpr auto problemSize = 64 * 1024; ///< total number of particles
constexpr auto steps = 5; ///< number of steps to calculate
constexpr auto steps = 20; ///< number of steps to calculate, excluding 1 warmup run
constexpr auto allowRsqrt = true; // rsqrt can be way faster, but less accurate
constexpr auto runUpate = true; // run update step. Useful to disable for benchmarking the move step.

Expand Down Expand Up @@ -324,22 +325,23 @@ void run(std::ostream& plotFile)
alpaka::Vec<Dim, Size>{static_cast<Size>(threadsPerBlock)},
alpaka::Vec<Dim, Size>{static_cast<Size>(elementsPerThread)}};

double sumUpdate = 0;
double sumMove = 0;
for(int s = 0; s < steps; ++s)
common::Stats statsUpdate;
common::Stats statsMove;
for(int s = 0; s < steps + 1; ++s)
{
if constexpr(runUpate)
{
auto updateKernel = UpdateKernel<elementsPerThread, MappingSM>{};
alpaka::exec<Acc>(queue, workdiv, updateKernel, llama::shallowCopy(accView));
sumUpdate += watch.printAndReset("update", '\t');
statsUpdate(watch.printAndReset("update", '\t'));
}

auto moveKernel = MoveKernel<elementsPerThread>{};
alpaka::exec<Acc>(queue, workdiv, moveKernel, llama::shallowCopy(accView));
sumMove += watch.printAndReset("move");
statsMove(watch.printAndReset("move"));
}
plotFile << std::quoted(title) << "\t" << sumUpdate / steps << '\t' << sumMove / steps << '\n';
plotFile << std::quoted(title) << "\t" << statsUpdate.mean() << "\t" << statsUpdate.sem() << '\t'
<< statsMove.mean() << "\t" << statsMove.sem() << '\n';

for(std::size_t i = 0; i < mapping.blobCount; i++)
alpaka::memcpy(queue, hostView.blobs()[i], accView.blobs()[i]);
Expand Down Expand Up @@ -372,20 +374,21 @@ try
# {}
set title "nbody alpaka {}ki particles on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set style histogram errorbars
set style fill solid border -1
set xtics rotate by 45 right nomirror
set key out top center maxrows 3
set yrange [0:*]
set y2range [0:*]
set ylabel "update runtime [s]"
set y2label "move runtime [s]"
set y2tics auto
$data << EOD
"" "update" "update_sem" "move" "move_sem"
)",
env,
problemSize / 1024,
alpaka::getAccName<Acc>());
plotFile << "\"\"\t\"update\"\t\"move\"\n";

run<Acc, AoS, AoS>(plotFile);
if constexpr(hasSharedMem<Acc>)
Expand All @@ -405,7 +408,7 @@ set y2tics auto
run<Acc, SplitGpuGems, AoS>(plotFile);

plotFile << R"(EOD
plot $data using 2:xtic(1) ti col axis x1y1, "" using 3 ti col axis x1y2
plot $data using 2:3:xtic(1) ti col axis x1y1, "" using 4:5 ti col axis x1y2
)";
std::cout << "Plot with: ./nbody_alpaka.sh\n";

Expand Down
4 changes: 2 additions & 2 deletions examples/alpaka/pic/pic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,8 +890,8 @@ try
# {}
set title "PIC grid {}x{} {}k particles on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set style fill solid border -1
set xtics rotate by 45 right nomirror
set key out top center maxrows 3
set yrange [0:*]
set ylabel "runtime [s]"
Expand Down
29 changes: 14 additions & 15 deletions examples/alpaka/vectoradd/vectoradd.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
// SPDX-License-Identifier: CC0-1.0

#include "../../common/Stats.hpp"
#include "../../common/Stopwatch.hpp"
#include "../../common/env.hpp"

Expand All @@ -15,7 +16,7 @@
using Size = std::size_t;

constexpr auto problemSize = Size{64 * 1024 * 1024 + 3}; ///< problem size
constexpr auto steps = 10;
constexpr auto steps = 20; // excluding 1 warmup run
constexpr auto aosoaLanes = 32;
constexpr auto elements = Size{32};

Expand Down Expand Up @@ -47,7 +48,7 @@ inline constexpr bool isGpu<alpaka::AccGpuCudaRt<Dim, Size>> = true;
#endif

template<std::size_t Elems>
struct ComputeKernel
struct Kernel
{
template<typename Acc, typename View>
LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View a, View b) const
Expand Down Expand Up @@ -162,18 +163,13 @@ try
const auto workdiv = alpaka::getValidWorkDiv<Acc>(devAcc, problemSize, elements, false);
std::cout << "Workdiv: " << workdiv << "\n";

double acc = 0;
for(std::size_t s = 0; s < steps; ++s)
common::Stats stats;
for(std::size_t s = 0; s < steps + 1; ++s)
{
alpaka::exec<Acc>(
queue,
workdiv,
ComputeKernel<elements>{},
llama::shallowCopy(devA),
llama::shallowCopy(devB));
acc += chrono.printAndReset("Compute kernel");
alpaka::exec<Acc>(queue, workdiv, Kernel<elements>{}, llama::shallowCopy(devA), llama::shallowCopy(devB));
stats(chrono.printAndReset("vectoradd"));
}
plotFile << "\"" << mappingname << "\"\t" << acc / steps << '\n';
plotFile << "\"LLAMA " << mappingname << "\"\t" << stats.mean() << "\t" << stats.sem() << '\n';

for(std::size_t i = 0; i < blobCount; i++)
{
Expand Down Expand Up @@ -202,11 +198,14 @@ auto main() -> int
# {}
set title "vectoradd alpaka {}Mi elements on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set style histogram errorbars
set style fill solid border -1
set xtics rotate by 45 right nomirror
set key off
set yrange [0:*]
set ylabel "runtime [s]"
$data << EOD
"" "runtime" "runtime_sem"
)",
env,
problemSize / 1024 / 1024,
Expand All @@ -215,7 +214,7 @@ set ylabel "runtime [s]"
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<6>>([&](auto ic) { run<decltype(ic)::value>(plotFile); });

plotFile << R"(EOD
plot $data using 2:xtic(1) ti "compute kernel"
plot $data using 2:3:xtic(1) ti col
)";
std::cout << "Plot with: ./vectoradd_alpaka.sh\n";
return 0;
Expand Down
67 changes: 67 additions & 0 deletions examples/common/Stats.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2023 Bernhard Manfred Gruber
// SPDX-License-Identifier: LGPL-3.0-or-later

#pragma once

#include <cmath>
#include <numeric>
#include <vector>

namespace common
{
/// Collects measurement samples and computes simple statistics over them.
/// The first value passed to operator() after construction is dropped, so a single warmup run does not
/// influence the results.
struct Stats
{
    bool skipNext = true; // to ignore 1 warmup run
    std::vector<double> values; // all recorded samples, without the skipped warmup value

    Stats()
    {
        // make sure benchmarks don't incur a reallocation
        values.reserve(100);
    }

    /// Records a sample. While skipNext is set, the sample is discarded and skipNext is cleared instead.
    void operator()(double val)
    {
        if(skipNext)
            skipNext = false;
        else
            values.push_back(val);
    }

    /// Sum of all recorded samples (0 if there are none).
    auto sum() const -> double
    {
        return std::reduce(values.begin(), values.end());
    }

    /// Arithmetic mean of the recorded samples. Returns 0 if no sample was recorded, instead of dividing by zero.
    auto mean() const -> double
    {
        if(values.empty())
            return 0;
        return sum() / static_cast<double>(values.size());
    }

    /// Sample standard deviation (divides by n - 1).
    /// It is undefined for fewer than two samples; 0 is returned in that case, which also avoids the unsigned
    /// underflow of `values.size() - 1` for an empty sample set.
    auto sstddev() const -> double
    {
        if(values.size() < 2)
            return 0;
        double squaredDiffs = 0;
        const auto m = mean();
        for(const auto v : values)
            squaredDiffs += (v - m) * (v - m);
        return std::sqrt(squaredDiffs / static_cast<double>(values.size() - 1));
    }

    /// Standard error of the mean: sstddev() / sqrt(n). Returns 0 for fewer than two samples.
    auto sem() const -> double
    {
        if(values.size() < 2)
            return 0;
        return sstddev() / std::sqrt(static_cast<double>(values.size()));
    }

    /// Appends all samples of s to this object. skipNext of either object is left untouched.
    auto operator+=(const Stats& s) -> Stats&
    {
        if(&s == this)
        {
            // vector::insert must not be given iterators into the vector itself, so append from a copy
            const auto copy = values;
            values.insert(values.end(), copy.begin(), copy.end());
        }
        else
            values.insert(values.end(), s.values.begin(), s.values.end());
        return *this;
    }

    /// Returns a Stats holding the samples of a followed by the samples of b.
    friend auto operator+(Stats a, const Stats& b) -> Stats
    {
        a += b;
        return a; // returning the by-value parameter allows it to be moved out
    }
};
} // namespace common
8 changes: 8 additions & 0 deletions examples/common/Stopwatch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ struct Stopwatch
return seconds;
}

/// Returns the time in seconds that passed since `last` was set and restarts the measurement.
auto getAndReset() -> double
{
    const auto now = clock::now();
    const auto seconds = std::chrono::duration<double>{now - last}.count();
    // Restart from the same clock sample instead of reading the clock again, so the time between two clock reads
    // is not dropped from the next measurement.
    last = now;
    return seconds;
}

private:
clock::time_point last = clock::now();
};
Loading

0 comments on commit d81094c

Please sign in to comment.