Improve benchmark plots

* Add simple Stats class for statistics
* Add error bars based on SEM to plots where mean is reported
* Add a warmup run
* Run 20 steps/iterations
* Improve plot display (bar borders, xtics nomirror)

Fixes: #401
alpaka-group · Nov 22, 2023 · d81094c · d81094c
1 parent 07ae18d
commit d81094c
Show file tree

Hide file tree

Showing 10 changed files with 275 additions and 174 deletions.
diff --git a/examples/alpaka/daxpy/daxpy.cpp b/examples/alpaka/daxpy/daxpy.cpp
@@ -1,6 +1,7 @@
 // Copyright 2022 Bernhard Manfred Gruber
 // SPDX-License-Identifier: LGPL-3.0-or-later
 
+#include "../../common/Stats.hpp"
 #include "../../common/Stopwatch.hpp"
 #include "../../common/env.hpp"
 
@@ -15,8 +16,7 @@
 
 constexpr auto problemSize = std::size_t{1024} * 1024 * 128;
 constexpr auto gpuBlockSize = std::size_t{256};
-constexpr auto warmupSteps = 1;
-constexpr auto steps = 5;
+constexpr auto steps = 20; // number of steps to calculate, excluding 1 warmup run
 constexpr auto alpha = 3.14;
 
 static_assert(problemSize % gpuBlockSize == 0);
@@ -40,18 +40,15 @@ void daxpy(std::ofstream& plotFile)
     }
     watch.printAndReset("init");
 
-    double sum = 0;
-    for(std::size_t s = 0; s < warmupSteps + steps; ++s)
+    common::Stats stats;
+    for(std::size_t s = 0; s < steps + 1; ++s)
     {
 #pragma omp parallel for
         for(std::ptrdiff_t i = 0; i < problemSize; i++)
             z[i] = alpha * x[i] + y[i];
-        if(s < warmupSteps)
-            watch.printAndReset("daxpy (warmup)");
-        else
-            sum += watch.printAndReset("daxpy");
+        stats(watch.printAndReset("daxpy"));
     }
-    plotFile << std::quoted(title) << "\t" << sum / steps << '\n';
+    plotFile << std::quoted(title) << "\t" << stats.mean() << "\t" << stats.sem() << '\n';
 }
 
 template<typename Acc>
@@ -116,8 +113,8 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
         alpaka::Vec<Dim, Size>{Size{1}});
     watch = {};
 
-    double sum = 0;
-    for(std::size_t s = 0; s < warmupSteps + steps; ++s)
+    common::Stats stats;
+    for(std::size_t s = 0; s < steps + 1; ++s)
     {
         auto kernel = [] ALPAKA_FN_ACC(
                           const Acc& acc,
@@ -137,10 +134,7 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
             llama::shallowCopy(viewY),
             alpha,
             llama::shallowCopy(viewZ));
-        if(s < warmupSteps)
-            watch.printAndReset("daxpy (warmup)");
-        else
-            sum += watch.printAndReset("daxpy");
+        stats(watch.printAndReset("daxpy"));
     }
 
     for(std::size_t i = 0; i < Mapping::blobCount; i++)
@@ -150,7 +144,7 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
     }
     watch.printAndReset("copy D->H");
 
-    plotFile << std::quoted(title) << "\t" << sum / steps << '\n';
+    plotFile << std::quoted(title) << "\t" << stats.mean() << "\t" << stats.sem() << '\n';
 }
 
 auto main() -> int
@@ -164,19 +158,21 @@ try
         problemSize * sizeof(double) / 1024 / 1024,
         env);
 
-    std::ofstream plotFile{"daxpy.sh"};
+    std::ofstream plotFile{"daxpy_alpaka.sh"};
     plotFile.exceptions(std::ios::badbit | std::ios::failbit);
     plotFile << fmt::format(
         R"(#!/usr/bin/gnuplot -p
 # {}
 set title "daxpy CPU {}Mi doubles"
 set style data histograms
-set style fill solid
-set xtics rotate by 45 right
+set style histogram errorbars
+set style fill solid border -1
+set xtics rotate by 45 right nomirror
 set key off
 set yrange [0:*]
 set ylabel "runtime [s]"
 $data << EOD
+""	"runtime"	"runtime_sem"
 )",
         env,
         problemSize / 1024 / 1024);
@@ -261,9 +257,9 @@ set ylabel "runtime [s]"
             llama::Constant<10>>{extents});
 
     plotFile << R"(EOD
-plot $data using 2:xtic(1)
+plot $data using 2:3:xtic(1) ti col
 )";
-    std::cout << "Plot with: ./daxpy.sh\n";
+    std::cout << "Plot with: ./daxpy_alpaka.sh\n";
 
     return 0;
 }

diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
@@ -1,6 +1,7 @@
 // Copyright 2022 Bernhard Manfred Gruber
 // SPDX-License-Identifier: LGPL-3.0-or-later
 
+#include "../../common/Stats.hpp"
 #include "../../common/Stopwatch.hpp"
 #include "../../common/env.hpp"
 
@@ -23,7 +24,7 @@
 using FP = float;
 
 constexpr auto problemSize = 64 * 1024; ///< total number of particles
-constexpr auto steps = 5; ///< number of steps to calculate
+constexpr auto steps = 20; ///< number of steps to calculate, excluding 1 warmup run
 constexpr auto allowRsqrt = true; // rsqrt can be way faster, but less accurate
 constexpr auto runUpate = true; // run update step. Useful to disable for benchmarking the move step.
 
@@ -324,22 +325,23 @@ void run(std::ostream& plotFile)
         alpaka::Vec<Dim, Size>{static_cast<Size>(threadsPerBlock)},
         alpaka::Vec<Dim, Size>{static_cast<Size>(elementsPerThread)}};
 
-    double sumUpdate = 0;
-    double sumMove = 0;
-    for(int s = 0; s < steps; ++s)
+    common::Stats statsUpdate;
+    common::Stats statsMove;
+    for(int s = 0; s < steps + 1; ++s)
     {
         if constexpr(runUpate)
         {
             auto updateKernel = UpdateKernel<elementsPerThread, MappingSM>{};
             alpaka::exec<Acc>(queue, workdiv, updateKernel, llama::shallowCopy(accView));
-            sumUpdate += watch.printAndReset("update", '\t');
+            statsUpdate(watch.printAndReset("update", '\t'));
         }
 
         auto moveKernel = MoveKernel<elementsPerThread>{};
         alpaka::exec<Acc>(queue, workdiv, moveKernel, llama::shallowCopy(accView));
-        sumMove += watch.printAndReset("move");
+        statsMove(watch.printAndReset("move"));
     }
-    plotFile << std::quoted(title) << "\t" << sumUpdate / steps << '\t' << sumMove / steps << '\n';
+    plotFile << std::quoted(title) << "\t" << statsUpdate.mean() << "\t" << statsUpdate.sem() << '\t'
+             << statsMove.mean() << "\t" << statsMove.sem() << '\n';
 
     for(std::size_t i = 0; i < mapping.blobCount; i++)
         alpaka::memcpy(queue, hostView.blobs()[i], accView.blobs()[i]);
@@ -372,20 +374,21 @@ try
 # {}
 set title "nbody alpaka {}ki particles on {}"
 set style data histograms
-set style fill solid
-set xtics rotate by 45 right
+set style histogram errorbars
+set style fill solid border -1
+set xtics rotate by 45 right nomirror
 set key out top center maxrows 3
 set yrange [0:*]
 set y2range [0:*]
 set ylabel "update runtime [s]"
 set y2label "move runtime [s]"
 set y2tics auto
 $data << EOD
+""	"update"	"update_sem"	"move"	"move_sem"
 )",
         env,
         problemSize / 1024,
         alpaka::getAccName<Acc>());
-    plotFile << "\"\"\t\"update\"\t\"move\"\n";
 
     run<Acc, AoS, AoS>(plotFile);
     if constexpr(hasSharedMem<Acc>)
@@ -405,7 +408,7 @@ set y2tics auto
     run<Acc, SplitGpuGems, AoS>(plotFile);
 
     plotFile << R"(EOD
-plot $data using 2:xtic(1) ti col axis x1y1, "" using 3 ti col axis x1y2
+plot $data using 2:3:xtic(1) ti col axis x1y1, "" using 4:5 ti col axis x1y2
 )";
     std::cout << "Plot with: ./nbody_alpaka.sh\n";
 

diff --git a/examples/alpaka/pic/pic.cpp b/examples/alpaka/pic/pic.cpp
@@ -890,8 +890,8 @@ try
 # {}
 set title "PIC grid {}x{} {}k particles on {}"
 set style data histograms
-set style fill solid
-set xtics rotate by 45 right
+set style fill solid border -1
+set xtics rotate by 45 right nomirror
 set key out top center maxrows 3
 set yrange [0:*]
 set ylabel "runtime [s]"

diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp
@@ -1,6 +1,7 @@
 // Copyright 2022 Alexander Matthes, Bernhard Manfred Gruber
 // SPDX-License-Identifier: CC0-1.0
 
+#include "../../common/Stats.hpp"
 #include "../../common/Stopwatch.hpp"
 #include "../../common/env.hpp"
 
@@ -15,7 +16,7 @@
 using Size = std::size_t;
 
 constexpr auto problemSize = Size{64 * 1024 * 1024 + 3}; ///< problem size
-constexpr auto steps = 10;
+constexpr auto steps = 20; // excluding 1 warmup run
 constexpr auto aosoaLanes = 32;
 constexpr auto elements = Size{32};
 
@@ -47,7 +48,7 @@ inline constexpr bool isGpu<alpaka::AccGpuCudaRt<Dim, Size>> = true;
 #endif
 
 template<std::size_t Elems>
-struct ComputeKernel
+struct Kernel
 {
     template<typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View a, View b) const
@@ -162,18 +163,13 @@ try
     const auto workdiv = alpaka::getValidWorkDiv<Acc>(devAcc, problemSize, elements, false);
     std::cout << "Workdiv: " << workdiv << "\n";
 
-    double acc = 0;
-    for(std::size_t s = 0; s < steps; ++s)
+    common::Stats stats;
+    for(std::size_t s = 0; s < steps + 1; ++s)
     {
-        alpaka::exec<Acc>(
-            queue,
-            workdiv,
-            ComputeKernel<elements>{},
-            llama::shallowCopy(devA),
-            llama::shallowCopy(devB));
-        acc += chrono.printAndReset("Compute kernel");
+        alpaka::exec<Acc>(queue, workdiv, Kernel<elements>{}, llama::shallowCopy(devA), llama::shallowCopy(devB));
+        stats(chrono.printAndReset("vectoradd"));
     }
-    plotFile << "\"" << mappingname << "\"\t" << acc / steps << '\n';
+    plotFile << "\"LLAMA " << mappingname << "\"\t" << stats.mean() << "\t" << stats.sem() << '\n';
 
     for(std::size_t i = 0; i < blobCount; i++)
     {
@@ -202,11 +198,14 @@ auto main() -> int
 # {}
 set title "vectoradd alpaka {}Mi elements on {}"
 set style data histograms
-set style fill solid
-set xtics rotate by 45 right
+set style histogram errorbars
+set style fill solid border -1
+set xtics rotate by 45 right nomirror
+set key off
 set yrange [0:*]
 set ylabel "runtime [s]"
 $data << EOD
+""	"runtime"	"runtime_sem"
 )",
         env,
         problemSize / 1024 / 1024,
@@ -215,7 +214,7 @@ set ylabel "runtime [s]"
     boost::mp11::mp_for_each<boost::mp11::mp_iota_c<6>>([&](auto ic) { run<decltype(ic)::value>(plotFile); });
 
     plotFile << R"(EOD
-plot $data using 2:xtic(1) ti "compute kernel"
+plot $data using 2:3:xtic(1) ti col
 )";
     std::cout << "Plot with: ./vectoradd_alpaka.sh\n";
     return 0;

diff --git a/examples/common/Stats.hpp b/examples/common/Stats.hpp
@@ -0,0 +1,67 @@
+// Copyright 2023 Bernhard Manfred Gruber
+// SPDX-License-Identifier: LGPL-3.0-or-later
+
+#pragma once
+
+#include <cmath>
+#include <numeric>
+#include <vector>
+
+namespace common
+{
+    /// Collects timing samples and computes summary statistics over them.
+    /// The first sample passed to operator() after construction is discarded as a warmup run.
+    struct Stats
+    {
+        bool skipNext = true; // when true, the next sample is dropped instead of stored (1 warmup run)
+        std::vector<double> values; // retained samples, in the order they were recorded
+
+        Stats()
+        {
+            // make sure benchmarks don't incur a reallocation
+            values.reserve(100);
+        }
+
+        /// Records val, unless skipNext is set, in which case val is dropped and skipNext is cleared.
+        void operator()(double val)
+        {
+            if(skipNext)
+                skipNext = false;
+            else
+                values.push_back(val);
+        }
+
+        /// Returns the sum of all retained samples (0 if there are none).
+        auto sum() const -> double
+        {
+            return std::reduce(values.begin(), values.end());
+        }
+
+        /// Returns the arithmetic mean of the retained samples.
+        /// Without any samples this evaluates 0.0 / 0, which is NaN under IEEE 754 arithmetic.
+        auto mean() const -> double
+        {
+            return sum() / static_cast<double>(values.size());
+        }
+
+        /// Returns the sample standard deviation (divisor n - 1) of the retained samples.
+        /// It is undefined for fewer than 2 samples; 0 is returned in that case.
+        auto sstddev() const -> double
+        {
+            // Guard first: size() is unsigned, so size() - 1 would wrap around for n == 0,
+            // and n == 1 would evaluate 0 / 0.
+            if(values.size() < 2)
+                return 0;
+
+            const auto m = mean();
+            double squaredDeviations = 0;
+            for(auto v : values)
+                squaredDeviations += (v - m) * (v - m);
+            return std::sqrt(squaredDeviations / static_cast<double>(values.size() - 1));
+        }
+
+        /// Returns the standard error of the mean: sstddev() / sqrt(n).
+        /// Returns 0 for fewer than 2 samples, so the value can always be written out as an error bar.
+        auto sem() const -> double
+        {
+            if(values.size() < 2)
+                return 0;
+            return sstddev() / std::sqrt(static_cast<double>(values.size()));
+        }
+
+        /// Appends all samples of s to this object. skipNext is neither read nor changed on either object.
+        /// s must not be *this: std::vector::insert does not allow the inserted range to point into the
+        /// destination vector.
+        auto operator+=(const Stats& s) -> Stats&
+        {
+            values.insert(values.end(), s.values.begin(), s.values.end());
+            return *this;
+        }
+
+        /// Returns a copy of a with the samples of b appended.
+        friend auto operator+(Stats a, const Stats& b) -> Stats
+        {
+            return a += b;
+        }
+    };
+} // namespace common
diff --git a/examples/common/Stopwatch.hpp b/examples/common/Stopwatch.hpp
@@ -20,6 +20,14 @@ struct Stopwatch
         return seconds;
     }
 
+    /// Returns the seconds elapsed since `last` was set (at construction or by a previous reset)
+    /// and starts the next measurement interval.
+    auto getAndReset() -> double
+    {
+        const auto now = clock::now();
+        const auto seconds = std::chrono::duration<double>{now - last}.count();
+        // Restart from the same time point that ended this interval. Reading the clock a second time
+        // would drop the time between the two reads from the next measurement.
+        last = now;
+        return seconds;
+    }
+
 private:
     clock::time_point last = clock::now();
 };