diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp index a22f7d032d3..8885b73b80c 100644 --- a/benchmarks/babelstream/src/babelStreamCommon.hpp +++ b/benchmarks/babelstream/src/babelStreamCommon.hpp @@ -28,10 +28,11 @@ namespace [[maybe_unused]] constexpr auto minArrSize = 1024 * 128; // Scalar value for Mul and Triad kernel parameters. - [[maybe_unused]] constexpr auto scalarVal = 2.0f; + [[maybe_unused]] constexpr double scalarVal = 0.4; // Block thread extent for DotKernel test work division parameters. [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024; + [[maybe_unused]] constexpr auto dotGridBlockExtent = 256; // Number of runs for each kernel, can be changed by command line arguments. // At least 100 runs are recommended for good benchmarking. @@ -39,7 +40,7 @@ namespace [[maybe_unused]] auto numberOfRuns = 2; // Data input value for babelstream. - [[maybe_unused]] constexpr auto valA = 1.0f; + [[maybe_unused]] constexpr double valA = 0.1; //! handleCustomArguments Gets custom cmd line arguments from the all arguments. //! 
Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are @@ -111,7 +112,7 @@ namespace { if constexpr(std::is_floating_point_v) { - return std::fabs(a - b) < std::numeric_limits::epsilon() * static_cast(100.0); + return std::fabs(a - b) < (std::numeric_limits::epsilon() * static_cast(100.0)); } else if constexpr(std::is_integral_v) { diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp index 79ec6216508..2abd10eafc6 100644 --- a/benchmarks/babelstream/src/babelStreamMainTest.cpp +++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp @@ -22,7 +22,7 @@ * Can be run with custom arguments as well as catch2 arguments * Run with Custom arguments: * ./babelstream --array-size=33554432 --number-runs=100 - * Runt with default array size and num runs: + * Run with default array size and num runs: * ./babelstream * Run with Catch2 arguments and defaul arrary size and num runs: * ./babelstream --success @@ -76,12 +76,12 @@ struct CopyKernel //! \tparam T The data type //! \param acc The accelerator to be executed on. //! \param a Pointer for vector a - //! \param b Pointer for vector b + //! \param c Pointer for vector c template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const { auto const [index] = alpaka::getIdx(acc); - b[index] = a[index]; + c[index] = a[index]; } }; @@ -92,14 +92,14 @@ struct MultKernel //! \tparam TAcc The accelerator environment to be executed on. //! \tparam T The data type //! \param acc The accelerator to be executed on. - //! \param a Pointer for vector a + //! \param c Pointer for vector c //! 
\param b Pointer for result vector b template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const { const T scalar = static_cast(scalarVal); auto const [i] = alpaka::getIdx(acc); - b[i] = scalar * a[i]; + b[i] = scalar * c[i]; } }; @@ -132,11 +132,11 @@ struct TriadKernel //! \param b Pointer for vector b //! \param c Pointer for result vector c template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const { const T scalar = static_cast(scalarVal); auto const [i] = alpaka::getIdx(acc); - c[i] = a[i] + scalar * b[i]; + a[i] = b[i] + scalar * c[i]; } }; @@ -151,6 +151,7 @@ struct DotKernel //! \param a Pointer for vector a //! \param b Pointer for vector b //! \param sum Pointer for result vector consisting sums for each block + //! \param arraySize the size of the array template ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx arraySize) const { @@ -316,23 +317,23 @@ void testKernels() }, "InitKernel"); - // Test the copy-kernel. Copy A one by one to B. + // Test the copy-kernel. Copy A one by one to C. measureKernelExec( - [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); }, "CopyKernel"); - // Test the scaling-kernel. Calculate B=scalar*A. + // Test the scaling-kernel. Calculate B=scalar*C. Where C = A. measureKernelExec( - [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); }, + [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); }, "MultKernel"); - // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A. + // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A. 
measureKernelExec( [&]() { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, "AddKernel"); - // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A. + // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A. measureKernelExec( [&]() { alpaka::exec(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, @@ -350,19 +351,21 @@ void testKernels() DataType initVal{static_cast(0.0)}; DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; - auto const expectedC = static_cast(valA + scalarVal * scalarVal * valA); + auto const expectedB = static_cast(scalarVal * valA); - auto const expectedA = static_cast(valA); + auto const expectedC = static_cast(static_cast(valA) + expectedB); + auto const expectedA = static_cast(expectedB + static_cast(scalarVal) * expectedC); // sum of the errors for each array for(Idx i = 0; i < arraySize; ++i) { - sumErrC += bufHostOutputC[static_cast(i)] - expectedC; - sumErrB += bufHostOutputB[static_cast(i)] - expectedB; - sumErrA += bufHostOutputA[static_cast(i)] - expectedA; + sumErrC += std::fabs(bufHostOutputC[static_cast(i)] - expectedC); + sumErrB += std::fabs(bufHostOutputB[static_cast(i)] - expectedB); + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); } // Normalize and compare sum of the errors + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); @@ -375,7 +378,7 @@ void testKernels() // Threads per block for Dot kernel constexpr Idx blockThreadExtent = blockThreadExtentMain; // Blocks per grid for Dot kernel - constexpr Idx gridBlockExtent = static_cast(256); + const Idx gridBlockExtent = 
static_cast(dotGridBlockExtent); // Vector of sums of each block auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); @@ -401,8 +404,12 @@ void testKernels() DataType const* sumPtr = std::data(bufHostSumPerBlock); auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); - // Since vector values are 1, dot product should be identical to arraySize - REQUIRE(FuzzyEqual(static_cast(result), static_cast(arraySize * 2))); + + auto const expectedSum = static_cast(arraySize) * expectedA * expectedB; + // dot product should be identical to arraySize*expectedA*expectedB + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function + REQUIRE(FuzzyEqual((static_cast(result) - expectedSum) / expectedSum, static_cast(0.0))); + // Add workdiv to the list of workdivs to print later metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); }