diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp index 8885b73b80c..d3fd6496bb7 100644 --- a/benchmarks/babelstream/src/babelStreamCommon.hpp +++ b/benchmarks/babelstream/src/babelStreamCommon.hpp @@ -24,6 +24,7 @@ namespace // To prevent timeouts in CI, a smaller default value is used. [[maybe_unused]] auto arraySizeMain = 1024 * 1024; + // Minimum array size to be used. [[maybe_unused]] constexpr auto minArrSize = 1024 * 128; @@ -39,8 +40,19 @@ namespace // To prevent timeouts in CI, a small value is used. [[maybe_unused]] auto numberOfRuns = 2; - // Data input value for babelstream. + // Data input values for babelstream. [[maybe_unused]] constexpr double valA = 0.1; + [[maybe_unused]] constexpr double valB = 0.2; + [[maybe_unused]] constexpr double valC = 0.2; + + enum class KernelsToRun + { + All, // init, add, copy, mul, triad + Triad, // only init and triad + NStream // only init and nstream + }; + + [[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All}; //! handleCustomArguments Gets custom cmd line arguments from the all arguments. //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are @@ -80,6 +92,26 @@ namespace std::cout << "Using default number of runs: " << numberOfRuns << std::endl; } } + else if(arg.rfind("--run-kernels=", 0) == 0) + { + auto const kernelsString = std::string(arg.substr(14)); // Convert to integer + if(kernelsString == "nstream") + { + std::cout << "Only nstream kernel will be executed." << std::endl; + kernelsToBeExecuted = KernelsToRun::NStream; + } + else if(kernelsString == "triad") + { + kernelsToBeExecuted = KernelsToRun::Triad; + std::cout << "Only triad kernel will be executed." << std::endl; + } + else if(kernelsString == "all") + { + // The variable kernelsToBeExecuted default value is "all"; + kernelsToBeExecuted = KernelsToRun::All; + std::cout << "All 5 babelstream kernels are going to be executed." << std::endl; + } + } else { // If it's not a custom argument, keep it for Catch2 @@ -90,6 +122,10 @@ namespace std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and " "--number-runs=100" << std::endl; + std::cout << "If you want to run only nstream kernel or triad kernel use --run-kernels=nstream or " + "--run-kernels=triad. Otherwise all 5 standar kernels will be run. Init, Copy, Mul, Add, " + "Triad. (and Dot if multithreaded acc is set)" + << std::endl; } } @@ -220,6 +256,7 @@ namespace WorkDivTriad, WorkDivMult, WorkDivDot, + WorkDivNStream, DeviceName, TimeUnit, KernelNames, @@ -354,11 +391,13 @@ namespace { std::stringstream ss; // define lambda to add values to a string stream created already - auto addItemValue = [&, this](BMInfoDataType item) { - ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); + auto addItemValue = [&, this](BMInfoDataType item) + { + if(metaDataMap.count(item) != 0) + ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); }; - // Initially chose some data to serialize + // Initially chose some data to serialize from the meta-data map to add to string stream ss << "\n"; addItemValue(BMInfoDataType::AcceleratorType); addItemValue(BMInfoDataType::NumRuns); @@ -372,7 +411,7 @@ namespace addItemValue(BMInfoDataType::WorkDivTriad); if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0) addItemValue(BMInfoDataType::WorkDivDot); - + addItemValue(BMInfoDataType::WorkDivNStream); auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string { std::string const str = metaDataMap.at(item); diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp index 2abd10eafc6..d9176e2471a 100644 --- a/benchmarks/babelstream/src/babelStreamMainTest.cpp +++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp @@ -20,8 +20,12 @@ * Some implementations and the documents are accessible through https://github.com/UoB-HPC * * Can be run with custom arguments as well as catch2 arguments - * Run with Custom arguments: + * Run with Custom arguments and for kernels: init,copy, mul, add, triad (and dot kernel if a multi-thread acc): * ./babelstream --array-size=33554432 --number-runs=100 + * Run with Custom arguments and select from 3 kernel groups: all, triad, nstream + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=nstream + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=all (default case) * Run with default array size and num runs: * ./babelstream * Run with Catch2 arguments and defaul arrary size and num runs: @@ -59,11 +63,11 @@ struct InitKernel //! \param a Pointer for vector a //! \param initA the value to set all items in the vector template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB) const { auto const [i] = alpaka::getIdx(acc); a[i] = initA; - b[i] = static_cast(0.0); + b[i] = initB; c[i] = static_cast(0.0); } }; @@ -100,6 +104,7 @@ struct MultKernel const T scalar = static_cast(scalarVal); auto const [i] = alpaka::getIdx(acc); b[i] = scalar * c[i]; + // if(i==1) printf("in mull kernel. b[i] = %f ", b[i] ); } }; @@ -140,6 +145,25 @@ struct TriadKernel } }; +// Not one of 5 babelstream kernels +struct NstreamKernel +{ + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + a[i] += b[i] + scalar * c[i]; + if(i == 1) + printf( + "In nstream kernel calculate a[i] : %f from b[i]: %f, scalar: %f, c[i]: %f \n", + a[i], + b[i], + scalar, + c[i]); + } +}; + //! Dot product of two vectors. The result is not a scalar but a vector of block-level dot products. For the //! BabelStream implementation and documentation: https://github.com/UoB-HPC struct DotKernel @@ -187,6 +211,12 @@ struct DotKernel template void testKernels() { + if(kernelsToBeExecuted == KernelsToRun::All) + { + std::cout << "Init, Copy, Mul, Add Kernels will be executed. if accelerator has multi-thread per block; dot " + "kernel will be executed too." + << std::endl; + } using Acc = TAcc; // Define the index domain // Set the number of dimensions as an integral constant. Set to 1 for 1D. @@ -252,7 +282,8 @@ void testKernels() bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr, - static_cast(valA)); + static_cast(valA), + static_cast(valB)); auto const workDivCopy = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); auto const workDivMult @@ -268,6 +299,14 @@ void testKernels() bufAccInputBPtr, bufAccOutputCPtr); + auto const workDivNStream = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + NstreamKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + // Vector of average run-times of babelstream kernels std::vector avgExecTimesOfKernels; std::vector minExecTimesOfKernels; @@ -313,142 +352,264 @@ void testKernels() bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr, - static_cast(valA)); + static_cast(valA), + static_cast(valB)); }, "InitKernel"); - // Test the copy-kernel. Copy A one by one to C. - measureKernelExec( - [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); }, - "CopyKernel"); + if(kernelsToBeExecuted == KernelsToRun::All) + { + // Test the copy-kernel. Copy A one by one to C. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); }, + "CopyKernel"); - // Test the scaling-kernel. Calculate B=scalar*C. Where C = A. - measureKernelExec( - [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); }, - "MultKernel"); + // Test the scaling-kernel. Calculate B=scalar*C. Where C = A. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); }, + "MultKernel"); - // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A. - measureKernelExec( - [&]() - { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, - "AddKernel"); + // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "AddKernel"); + } - // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A. - measureKernelExec( - [&]() - { alpaka::exec(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, - "TriadKernel"); + if(kernelsToBeExecuted == KernelsToRun::All || kernelsToBeExecuted == KernelsToRun::Triad) + { + // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A. + measureKernelExec( + [&]() { + alpaka::exec( + queue, + workDivTriad, + TriadKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + }, + "TriadKernel"); + } + + else if(kernelsToBeExecuted == KernelsToRun::NStream) + { + // Test the NStream-kernel. Calculate A += B + scalar * C; + measureKernelExec( + [&]() { + alpaka::exec( + queue, + workDivNStream, + NstreamKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + }, + "NstreamKernel"); + } - // Copy arrays back to host + // Copy arrays back to host since the execution of kernels except dot kernel finished alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize); alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize); alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize); - // Verify the results - // - // Find sum of the errors as sum of the differences from expected values - DataType initVal{static_cast(0.0)}; - DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; - - auto const expectedB = static_cast(scalarVal * valA); - auto const expectedC = static_cast(static_cast(valA) + expectedB); - auto const expectedA = static_cast(expectedB + static_cast(scalarVal) * expectedC); + // VERIFICATION and BW Calculation - // sum of the errors for each array - for(Idx i = 0; i < arraySize; ++i) + std::vector bandwidthsPerKernel; + std::vector bytesReadWriteMB; + if(kernelsToBeExecuted == KernelsToRun::All) { - sumErrC += std::fabs(bufHostOutputC[static_cast(i)] - expectedC); - sumErrB += std::fabs(bufHostOutputB[static_cast(i)] - expectedB); - sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); - } + // Verify the results for all kernels + // + // Find sum of the errors as sum of the differences from expected values + DataType initVal{static_cast(0.0)}; + DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; - // Normalize and compare sum of the errors - // Use a different equality check if floating point errors exceed precision of FuzzyEqual function - REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); - REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); - REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); - alpaka::wait(queue); - // Test Dot kernel with specific blocksize which is larger than 1 - if constexpr(alpaka::accMatchesTags) - { - using WorkDiv = alpaka::WorkDivMembers; - // Threads per block for Dot kernel - constexpr Idx blockThreadExtent = blockThreadExtentMain; - // Blocks per grid for Dot kernel - const Idx gridBlockExtent = static_cast(dotGridBlockExtent); - // Vector of sums of each block - auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); - auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); - // A specific work-division is used for dotKernel - auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + auto const expectedB = static_cast(scalarVal * valA); + auto const expectedC = static_cast(static_cast(valA) + expectedB); + auto const expectedA = static_cast(expectedB + static_cast(scalarVal) * expectedC); - measureKernelExec( - [&]() - { - alpaka::exec( - queue, - workDivDot, - DotKernel(), // Dot kernel - alpaka::getPtrNative(bufAccInputA), - alpaka::getPtrNative(bufAccInputB), - alpaka::getPtrNative(bufAccSumPerBlock), - static_cast>(arraySize)); - }, - "DotKernel"); + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrC += std::fabs(bufHostOutputC[static_cast(i)] - expectedC); + sumErrB += std::fabs(bufHostOutputB[static_cast(i)] - expectedB); + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } - alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + // Normalize and compare sum of the errors + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function + REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); alpaka::wait(queue); - DataType const* sumPtr = std::data(bufHostSumPerBlock); - auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); - auto const expectedSum = static_cast(arraySize) * expectedA * expectedB; - // dot product should be identical to arraySize*valA*valB - // Use a different equality check if floating point errors exceed precision of FuzzyEqual function - REQUIRE(FuzzyEqual((static_cast(result) - expectedSum) / expectedSum, static_cast(0.0))); + // Test Dot kernel with specific blocksize which is larger than 1 + if constexpr(alpaka::accMatchesTags) + { + using WorkDiv = alpaka::WorkDivMembers; + // Threads per block for Dot kernel + constexpr Idx blockThreadExtent = blockThreadExtentMain; + // Blocks per grid for Dot kernel + const Idx gridBlockExtent = static_cast(dotGridBlockExtent); + // Vector of sums of each block + auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); + auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); + // A specific work-division is used for dotKernel + auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivDot, + DotKernel(), // Dot kernel + alpaka::getPtrNative(bufAccInputA), + alpaka::getPtrNative(bufAccInputB), + alpaka::getPtrNative(bufAccSumPerBlock), + static_cast>(arraySize)); + }, + "DotKernel"); + + alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + alpaka::wait(queue); - // Add workdiv to the list of workdivs to print later - metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); - } + DataType const* sumPtr = std::data(bufHostSumPerBlock); + auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); + auto const expectedSum = static_cast(arraySize) * expectedA * expectedB; + // dot product should be identical to arraySize*valA*valB + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function + REQUIRE( + FuzzyEqual((static_cast(result) - expectedSum) / expectedSum, static_cast(0.0))); - // - // Calculate and Display Benchmark Results - // - std::vector bytesReadWriteMB = { - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - }; + // Add workdiv to the list of workdivs to print later + metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); + } - // calculate the bandwidth as throughput per seconds - std::vector bandwidthsPerKernel; - if(minExecTimesOfKernels.size() == kernelLabels.size()) + + // + // Calculate and Display Benchmark Results for All Kernels + // + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(2u, static_cast(arraySize)), // copy + getDataThroughput(2u, static_cast(arraySize)), // mul + getDataThroughput(3u, static_cast(arraySize)), // add + getDataThroughput(3u, static_cast(arraySize)), // triad + getDataThroughput(2u, static_cast(arraySize)), // dot + }; + + // calculate the bandwidth as throughput per seconds + + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); + metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); + metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); + metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + + } // for all 5 kernels and dot if accelerator is suitable + + + // Verify the Triad Kernel for "only-triad" run case and Calculate BW + else if(kernelsToBeExecuted == KernelsToRun::Triad) { - for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + // Verify triad + DataType sumErrA{0.0}; + auto const expectedA = static_cast(valB + scalarVal * valC); + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) { - bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); } + + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + + // Calculate and record benchmark results + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(3u, static_cast(arraySize)) // triad + }; + + // calculate the bandwidth as throughput per seconds + + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + } + + // Verify NStream Kernel Result and Calculate BW + else if(kernelsToBeExecuted == KernelsToRun::NStream) + { + // Verify NStream + DataType sumErrA{0.0}; + DataType expectedA = 0.0; + for(int i = 0; i < numberOfRuns - 1; i++) + expectedA += static_cast(valA + valB + static_cast(scalarVal) * valC); + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + if(i == 1) + { + std::cout << std::setprecision(12) << "std::fabs(bufHostOutputA[static_cast(i)] - expectedA):" + << std::fabs(bufHostOutputA[static_cast(i)] - expectedA) << std::endl; + std::cout << std::setprecision(12) + << "bufHostOutputA[static_cast(i)]:" << bufHostOutputA[static_cast(i)] + << std::endl; + } + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } + + std::cout << "is zero:" << sumErrA / static_cast(arraySize) / expectedA << std::endl; + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + + + // Calculate and record benchmark results + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(4u, static_cast(arraySize)) // NStream + }; + + // calculate the bandwidth as throughput per seconds + + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream); } + // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp()); metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns)); metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain)); metaData.setItem(BMInfoDataType::DataType, dataTypeStr); - - metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); - metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); - metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); - metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); - metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); - // Device and accelerator metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc)); metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName()); @@ -469,15 +630,15 @@ void testKernels() using TestAccs1D = alpaka::test::EnabledAccs, std::uint32_t>; // Run for all Accs given by the argument -TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D) { using Acc = TestType; // Run tests for the float data type testKernels(); } -// Run for all Accs given by the argument -TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +// // Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D) { using Acc = TestType; // Run tests for the double data type